diff options
author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-01-17 16:15:55 -0500 |
---|---|---|
committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-01-17 16:15:55 -0500 |
commit | 8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch) | |
tree | a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/ceph | |
parent | 406089d01562f1e2bf9f089fd7637009ebaad589 (diff) |
Patched in Tegra support.
Diffstat (limited to 'net/ceph')
-rw-r--r-- | net/ceph/Kconfig | 14 | ||||
-rw-r--r-- | net/ceph/auth_none.c | 15 | ||||
-rw-r--r-- | net/ceph/auth_x.c | 15 | ||||
-rw-r--r-- | net/ceph/auth_x.h | 6 | ||||
-rw-r--r-- | net/ceph/ceph_common.c | 81 | ||||
-rw-r--r-- | net/ceph/ceph_hash.c | 6 | ||||
-rw-r--r-- | net/ceph/crush/crush.c | 39 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c | 148 | ||||
-rw-r--r-- | net/ceph/crypto.c | 13 | ||||
-rw-r--r-- | net/ceph/crypto.h | 3 | ||||
-rw-r--r-- | net/ceph/debugfs.c | 10 | ||||
-rw-r--r-- | net/ceph/messenger.c | 1688 | ||||
-rw-r--r-- | net/ceph/mon_client.c | 204 | ||||
-rw-r--r-- | net/ceph/msgpool.c | 7 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 328 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 206 | ||||
-rw-r--r-- | net/ceph/pagelist.c | 19 |
17 files changed, 1167 insertions, 1635 deletions
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index cc04dd667a1..be683f2d401 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig | |||
@@ -27,17 +27,3 @@ config CEPH_LIB_PRETTYDEBUG | |||
27 | 27 | ||
28 | If unsure, say N. | 28 | If unsure, say N. |
29 | 29 | ||
30 | config CEPH_LIB_USE_DNS_RESOLVER | ||
31 | bool "Use in-kernel support for DNS lookup" | ||
32 | depends on CEPH_LIB | ||
33 | select DNS_RESOLVER | ||
34 | default n | ||
35 | help | ||
36 | If you say Y here, hostnames (e.g. monitor addresses) will | ||
37 | be resolved using the CONFIG_DNS_RESOLVER facility. | ||
38 | |||
39 | For information on how to use CONFIG_DNS_RESOLVER consult | ||
40 | Documentation/networking/dns_resolver.txt | ||
41 | |||
42 | If unsure, say N. | ||
43 | |||
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 925ca583c09..214c2bb43d6 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c | |||
@@ -59,7 +59,9 @@ static int handle_reply(struct ceph_auth_client *ac, int result, | |||
59 | */ | 59 | */ |
60 | static int ceph_auth_none_create_authorizer( | 60 | static int ceph_auth_none_create_authorizer( |
61 | struct ceph_auth_client *ac, int peer_type, | 61 | struct ceph_auth_client *ac, int peer_type, |
62 | struct ceph_auth_handshake *auth) | 62 | struct ceph_authorizer **a, |
63 | void **buf, size_t *len, | ||
64 | void **reply_buf, size_t *reply_len) | ||
63 | { | 65 | { |
64 | struct ceph_auth_none_info *ai = ac->private; | 66 | struct ceph_auth_none_info *ai = ac->private; |
65 | struct ceph_none_authorizer *au = &ai->au; | 67 | struct ceph_none_authorizer *au = &ai->au; |
@@ -80,12 +82,11 @@ static int ceph_auth_none_create_authorizer( | |||
80 | dout("built authorizer len %d\n", au->buf_len); | 82 | dout("built authorizer len %d\n", au->buf_len); |
81 | } | 83 | } |
82 | 84 | ||
83 | auth->authorizer = (struct ceph_authorizer *) au; | 85 | *a = (struct ceph_authorizer *)au; |
84 | auth->authorizer_buf = au->buf; | 86 | *buf = au->buf; |
85 | auth->authorizer_buf_len = au->buf_len; | 87 | *len = au->buf_len; |
86 | auth->authorizer_reply_buf = au->reply_buf; | 88 | *reply_buf = au->reply_buf; |
87 | auth->authorizer_reply_buf_len = sizeof (au->reply_buf); | 89 | *reply_len = sizeof(au->reply_buf); |
88 | |||
89 | return 0; | 90 | return 0; |
90 | 91 | ||
91 | bad2: | 92 | bad2: |
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a16bf14eb02..1587dc6010c 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c | |||
@@ -526,7 +526,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, | |||
526 | 526 | ||
527 | static int ceph_x_create_authorizer( | 527 | static int ceph_x_create_authorizer( |
528 | struct ceph_auth_client *ac, int peer_type, | 528 | struct ceph_auth_client *ac, int peer_type, |
529 | struct ceph_auth_handshake *auth) | 529 | struct ceph_authorizer **a, |
530 | void **buf, size_t *len, | ||
531 | void **reply_buf, size_t *reply_len) | ||
530 | { | 532 | { |
531 | struct ceph_x_authorizer *au; | 533 | struct ceph_x_authorizer *au; |
532 | struct ceph_x_ticket_handler *th; | 534 | struct ceph_x_ticket_handler *th; |
@@ -546,12 +548,11 @@ static int ceph_x_create_authorizer( | |||
546 | return ret; | 548 | return ret; |
547 | } | 549 | } |
548 | 550 | ||
549 | auth->authorizer = (struct ceph_authorizer *) au; | 551 | *a = (struct ceph_authorizer *)au; |
550 | auth->authorizer_buf = au->buf->vec.iov_base; | 552 | *buf = au->buf->vec.iov_base; |
551 | auth->authorizer_buf_len = au->buf->vec.iov_len; | 553 | *len = au->buf->vec.iov_len; |
552 | auth->authorizer_reply_buf = au->reply_buf; | 554 | *reply_buf = au->reply_buf; |
553 | auth->authorizer_reply_buf_len = sizeof (au->reply_buf); | 555 | *reply_len = sizeof(au->reply_buf); |
554 | |||
555 | return 0; | 556 | return 0; |
556 | } | 557 | } |
557 | 558 | ||
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index f459e93b774..e02da7a5c5a 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h | |||
@@ -13,7 +13,7 @@ | |||
13 | */ | 13 | */ |
14 | struct ceph_x_ticket_handler { | 14 | struct ceph_x_ticket_handler { |
15 | struct rb_node node; | 15 | struct rb_node node; |
16 | unsigned int service; | 16 | unsigned service; |
17 | 17 | ||
18 | struct ceph_crypto_key session_key; | 18 | struct ceph_crypto_key session_key; |
19 | struct ceph_timespec validity; | 19 | struct ceph_timespec validity; |
@@ -27,7 +27,7 @@ struct ceph_x_ticket_handler { | |||
27 | 27 | ||
28 | struct ceph_x_authorizer { | 28 | struct ceph_x_authorizer { |
29 | struct ceph_buffer *buf; | 29 | struct ceph_buffer *buf; |
30 | unsigned int service; | 30 | unsigned service; |
31 | u64 nonce; | 31 | u64 nonce; |
32 | char reply_buf[128]; /* big enough for encrypted blob */ | 32 | char reply_buf[128]; /* big enough for encrypted blob */ |
33 | }; | 33 | }; |
@@ -38,7 +38,7 @@ struct ceph_x_info { | |||
38 | bool starting; | 38 | bool starting; |
39 | u64 server_challenge; | 39 | u64 server_challenge; |
40 | 40 | ||
41 | unsigned int have_keys; | 41 | unsigned have_keys; |
42 | struct rb_root ticket_handlers; | 42 | struct rb_root ticket_handlers; |
43 | 43 | ||
44 | struct ceph_x_authorizer auth_authorizer; | 44 | struct ceph_x_authorizer auth_authorizer; |
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ee71ea26777..2883ea01e68 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/string.h> | 17 | #include <linux/string.h> |
18 | 18 | ||
19 | 19 | ||
20 | #include <linux/ceph/ceph_features.h> | ||
21 | #include <linux/ceph/libceph.h> | 20 | #include <linux/ceph/libceph.h> |
22 | #include <linux/ceph/debugfs.h> | 21 | #include <linux/ceph/debugfs.h> |
23 | #include <linux/ceph/decode.h> | 22 | #include <linux/ceph/decode.h> |
@@ -84,7 +83,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) | |||
84 | return -1; | 83 | return -1; |
85 | } | 84 | } |
86 | } else { | 85 | } else { |
86 | pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); | ||
87 | memcpy(&client->fsid, fsid, sizeof(*fsid)); | 87 | memcpy(&client->fsid, fsid, sizeof(*fsid)); |
88 | ceph_debugfs_client_init(client); | ||
89 | client->have_fsid = true; | ||
88 | } | 90 | } |
89 | return 0; | 91 | return 0; |
90 | } | 92 | } |
@@ -201,9 +203,7 @@ enum { | |||
201 | Opt_ip, | 203 | Opt_ip, |
202 | Opt_last_string, | 204 | Opt_last_string, |
203 | /* string args above */ | 205 | /* string args above */ |
204 | Opt_share, | ||
205 | Opt_noshare, | 206 | Opt_noshare, |
206 | Opt_crc, | ||
207 | Opt_nocrc, | 207 | Opt_nocrc, |
208 | }; | 208 | }; |
209 | 209 | ||
@@ -219,9 +219,7 @@ static match_table_t opt_tokens = { | |||
219 | {Opt_key, "key=%s"}, | 219 | {Opt_key, "key=%s"}, |
220 | {Opt_ip, "ip=%s"}, | 220 | {Opt_ip, "ip=%s"}, |
221 | /* string args above */ | 221 | /* string args above */ |
222 | {Opt_share, "share"}, | ||
223 | {Opt_noshare, "noshare"}, | 222 | {Opt_noshare, "noshare"}, |
224 | {Opt_crc, "crc"}, | ||
225 | {Opt_nocrc, "nocrc"}, | 223 | {Opt_nocrc, "nocrc"}, |
226 | {-1, NULL} | 224 | {-1, NULL} |
227 | }; | 225 | }; |
@@ -281,11 +279,10 @@ out: | |||
281 | return err; | 279 | return err; |
282 | } | 280 | } |
283 | 281 | ||
284 | struct ceph_options * | 282 | int ceph_parse_options(struct ceph_options **popt, char *options, |
285 | ceph_parse_options(char *options, const char *dev_name, | 283 | const char *dev_name, const char *dev_name_end, |
286 | const char *dev_name_end, | 284 | int (*parse_extra_token)(char *c, void *private), |
287 | int (*parse_extra_token)(char *c, void *private), | 285 | void *private) |
288 | void *private) | ||
289 | { | 286 | { |
290 | struct ceph_options *opt; | 287 | struct ceph_options *opt; |
291 | const char *c; | 288 | const char *c; |
@@ -294,7 +291,7 @@ ceph_parse_options(char *options, const char *dev_name, | |||
294 | 291 | ||
295 | opt = kzalloc(sizeof(*opt), GFP_KERNEL); | 292 | opt = kzalloc(sizeof(*opt), GFP_KERNEL); |
296 | if (!opt) | 293 | if (!opt) |
297 | return ERR_PTR(-ENOMEM); | 294 | return err; |
298 | opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), | 295 | opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), |
299 | GFP_KERNEL); | 296 | GFP_KERNEL); |
300 | if (!opt->mon_addr) | 297 | if (!opt->mon_addr) |
@@ -305,6 +302,7 @@ ceph_parse_options(char *options, const char *dev_name, | |||
305 | 302 | ||
306 | /* start with defaults */ | 303 | /* start with defaults */ |
307 | opt->flags = CEPH_OPT_DEFAULT; | 304 | opt->flags = CEPH_OPT_DEFAULT; |
305 | opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; | ||
308 | opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; | 306 | opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; |
309 | opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ | 307 | opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ |
310 | opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ | 308 | opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ |
@@ -390,7 +388,7 @@ ceph_parse_options(char *options, const char *dev_name, | |||
390 | 388 | ||
391 | /* misc */ | 389 | /* misc */ |
392 | case Opt_osdtimeout: | 390 | case Opt_osdtimeout: |
393 | pr_warning("ignoring deprecated osdtimeout option\n"); | 391 | opt->osd_timeout = intval; |
394 | break; | 392 | break; |
395 | case Opt_osdkeepalivetimeout: | 393 | case Opt_osdkeepalivetimeout: |
396 | opt->osd_keepalive_timeout = intval; | 394 | opt->osd_keepalive_timeout = intval; |
@@ -402,16 +400,10 @@ ceph_parse_options(char *options, const char *dev_name, | |||
402 | opt->mount_timeout = intval; | 400 | opt->mount_timeout = intval; |
403 | break; | 401 | break; |
404 | 402 | ||
405 | case Opt_share: | ||
406 | opt->flags &= ~CEPH_OPT_NOSHARE; | ||
407 | break; | ||
408 | case Opt_noshare: | 403 | case Opt_noshare: |
409 | opt->flags |= CEPH_OPT_NOSHARE; | 404 | opt->flags |= CEPH_OPT_NOSHARE; |
410 | break; | 405 | break; |
411 | 406 | ||
412 | case Opt_crc: | ||
413 | opt->flags &= ~CEPH_OPT_NOCRC; | ||
414 | break; | ||
415 | case Opt_nocrc: | 407 | case Opt_nocrc: |
416 | opt->flags |= CEPH_OPT_NOCRC; | 408 | opt->flags |= CEPH_OPT_NOCRC; |
417 | break; | 409 | break; |
@@ -422,11 +414,12 @@ ceph_parse_options(char *options, const char *dev_name, | |||
422 | } | 414 | } |
423 | 415 | ||
424 | /* success */ | 416 | /* success */ |
425 | return opt; | 417 | *popt = opt; |
418 | return 0; | ||
426 | 419 | ||
427 | out: | 420 | out: |
428 | ceph_destroy_options(opt); | 421 | ceph_destroy_options(opt); |
429 | return ERR_PTR(err); | 422 | return err; |
430 | } | 423 | } |
431 | EXPORT_SYMBOL(ceph_parse_options); | 424 | EXPORT_SYMBOL(ceph_parse_options); |
432 | 425 | ||
@@ -439,12 +432,9 @@ EXPORT_SYMBOL(ceph_client_id); | |||
439 | /* | 432 | /* |
440 | * create a fresh client instance | 433 | * create a fresh client instance |
441 | */ | 434 | */ |
442 | struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, | 435 | struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) |
443 | unsigned int supported_features, | ||
444 | unsigned int required_features) | ||
445 | { | 436 | { |
446 | struct ceph_client *client; | 437 | struct ceph_client *client; |
447 | struct ceph_entity_addr *myaddr = NULL; | ||
448 | int err = -ENOMEM; | 438 | int err = -ENOMEM; |
449 | 439 | ||
450 | client = kzalloc(sizeof(*client), GFP_KERNEL); | 440 | client = kzalloc(sizeof(*client), GFP_KERNEL); |
@@ -459,18 +449,10 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, | |||
459 | client->auth_err = 0; | 449 | client->auth_err = 0; |
460 | 450 | ||
461 | client->extra_mon_dispatch = NULL; | 451 | client->extra_mon_dispatch = NULL; |
462 | client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | | 452 | client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; |
463 | supported_features; | 453 | client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; |
464 | client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | | 454 | |
465 | required_features; | 455 | client->msgr = NULL; |
466 | |||
467 | /* msgr */ | ||
468 | if (ceph_test_opt(client, MYIP)) | ||
469 | myaddr = &client->options->my_addr; | ||
470 | ceph_messenger_init(&client->msgr, myaddr, | ||
471 | client->supported_features, | ||
472 | client->required_features, | ||
473 | ceph_test_opt(client, NOCRC)); | ||
474 | 456 | ||
475 | /* subsystems */ | 457 | /* subsystems */ |
476 | err = ceph_monc_init(&client->monc, client); | 458 | err = ceph_monc_init(&client->monc, client); |
@@ -494,15 +476,23 @@ void ceph_destroy_client(struct ceph_client *client) | |||
494 | { | 476 | { |
495 | dout("destroy_client %p\n", client); | 477 | dout("destroy_client %p\n", client); |
496 | 478 | ||
497 | atomic_set(&client->msgr.stopping, 1); | ||
498 | |||
499 | /* unmount */ | 479 | /* unmount */ |
500 | ceph_osdc_stop(&client->osdc); | 480 | ceph_osdc_stop(&client->osdc); |
501 | 481 | ||
482 | /* | ||
483 | * make sure osd connections close out before destroying the | ||
484 | * auth module, which is needed to free those connections' | ||
485 | * ceph_authorizers. | ||
486 | */ | ||
487 | ceph_msgr_flush(); | ||
488 | |||
502 | ceph_monc_stop(&client->monc); | 489 | ceph_monc_stop(&client->monc); |
503 | 490 | ||
504 | ceph_debugfs_client_cleanup(client); | 491 | ceph_debugfs_client_cleanup(client); |
505 | 492 | ||
493 | if (client->msgr) | ||
494 | ceph_messenger_destroy(client->msgr); | ||
495 | |||
506 | ceph_destroy_options(client->options); | 496 | ceph_destroy_options(client->options); |
507 | 497 | ||
508 | kfree(client); | 498 | kfree(client); |
@@ -524,9 +514,24 @@ static int have_mon_and_osd_map(struct ceph_client *client) | |||
524 | */ | 514 | */ |
525 | int __ceph_open_session(struct ceph_client *client, unsigned long started) | 515 | int __ceph_open_session(struct ceph_client *client, unsigned long started) |
526 | { | 516 | { |
517 | struct ceph_entity_addr *myaddr = NULL; | ||
527 | int err; | 518 | int err; |
528 | unsigned long timeout = client->options->mount_timeout * HZ; | 519 | unsigned long timeout = client->options->mount_timeout * HZ; |
529 | 520 | ||
521 | /* initialize the messenger */ | ||
522 | if (client->msgr == NULL) { | ||
523 | if (ceph_test_opt(client, MYIP)) | ||
524 | myaddr = &client->options->my_addr; | ||
525 | client->msgr = ceph_messenger_create(myaddr, | ||
526 | client->supported_features, | ||
527 | client->required_features); | ||
528 | if (IS_ERR(client->msgr)) { | ||
529 | client->msgr = NULL; | ||
530 | return PTR_ERR(client->msgr); | ||
531 | } | ||
532 | client->msgr->nocrc = ceph_test_opt(client, NOCRC); | ||
533 | } | ||
534 | |||
530 | /* open session, and wait for mon and osd maps */ | 535 | /* open session, and wait for mon and osd maps */ |
531 | err = ceph_monc_open_session(&client->monc); | 536 | err = ceph_monc_open_session(&client->monc); |
532 | if (err < 0) | 537 | if (err < 0) |
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 67bb1f11e61..0a1b53bce76 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c | |||
@@ -20,7 +20,7 @@ | |||
20 | c = c - a; c = c - b; c = c ^ (b >> 15); \ | 20 | c = c - a; c = c - b; c = c ^ (b >> 15); \ |
21 | } while (0) | 21 | } while (0) |
22 | 22 | ||
23 | unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length) | 23 | unsigned ceph_str_hash_rjenkins(const char *str, unsigned length) |
24 | { | 24 | { |
25 | const unsigned char *k = (const unsigned char *)str; | 25 | const unsigned char *k = (const unsigned char *)str; |
26 | __u32 a, b, c; /* the internal state */ | 26 | __u32 a, b, c; /* the internal state */ |
@@ -81,7 +81,7 @@ unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length) | |||
81 | /* | 81 | /* |
82 | * linux dcache hash | 82 | * linux dcache hash |
83 | */ | 83 | */ |
84 | unsigned int ceph_str_hash_linux(const char *str, unsigned int length) | 84 | unsigned ceph_str_hash_linux(const char *str, unsigned length) |
85 | { | 85 | { |
86 | unsigned long hash = 0; | 86 | unsigned long hash = 0; |
87 | unsigned char c; | 87 | unsigned char c; |
@@ -94,7 +94,7 @@ unsigned int ceph_str_hash_linux(const char *str, unsigned int length) | |||
94 | } | 94 | } |
95 | 95 | ||
96 | 96 | ||
97 | unsigned int ceph_str_hash(int type, const char *s, unsigned int len) | 97 | unsigned ceph_str_hash(int type, const char *s, unsigned len) |
98 | { | 98 | { |
99 | switch (type) { | 99 | switch (type) { |
100 | case CEPH_STR_HASH_LINUX: | 100 | case CEPH_STR_HASH_LINUX: |
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 089613234f0..d6ebb13a18a 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c | |||
@@ -26,9 +26,9 @@ const char *crush_bucket_alg_name(int alg) | |||
26 | * @b: bucket pointer | 26 | * @b: bucket pointer |
27 | * @p: item index in bucket | 27 | * @p: item index in bucket |
28 | */ | 28 | */ |
29 | int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) | 29 | int crush_get_bucket_item_weight(struct crush_bucket *b, int p) |
30 | { | 30 | { |
31 | if ((__u32)p >= b->size) | 31 | if (p >= b->size) |
32 | return 0; | 32 | return 0; |
33 | 33 | ||
34 | switch (b->alg) { | 34 | switch (b->alg) { |
@@ -37,13 +37,38 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) | |||
37 | case CRUSH_BUCKET_LIST: | 37 | case CRUSH_BUCKET_LIST: |
38 | return ((struct crush_bucket_list *)b)->item_weights[p]; | 38 | return ((struct crush_bucket_list *)b)->item_weights[p]; |
39 | case CRUSH_BUCKET_TREE: | 39 | case CRUSH_BUCKET_TREE: |
40 | return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; | 40 | if (p & 1) |
41 | return ((struct crush_bucket_tree *)b)->node_weights[p]; | ||
42 | return 0; | ||
41 | case CRUSH_BUCKET_STRAW: | 43 | case CRUSH_BUCKET_STRAW: |
42 | return ((struct crush_bucket_straw *)b)->item_weights[p]; | 44 | return ((struct crush_bucket_straw *)b)->item_weights[p]; |
43 | } | 45 | } |
44 | return 0; | 46 | return 0; |
45 | } | 47 | } |
46 | 48 | ||
49 | /** | ||
50 | * crush_calc_parents - Calculate parent vectors for the given crush map. | ||
51 | * @map: crush_map pointer | ||
52 | */ | ||
53 | void crush_calc_parents(struct crush_map *map) | ||
54 | { | ||
55 | int i, b, c; | ||
56 | |||
57 | for (b = 0; b < map->max_buckets; b++) { | ||
58 | if (map->buckets[b] == NULL) | ||
59 | continue; | ||
60 | for (i = 0; i < map->buckets[b]->size; i++) { | ||
61 | c = map->buckets[b]->items[i]; | ||
62 | BUG_ON(c >= map->max_devices || | ||
63 | c < -map->max_buckets); | ||
64 | if (c >= 0) | ||
65 | map->device_parents[c] = map->buckets[b]->id; | ||
66 | else | ||
67 | map->bucket_parents[-1-c] = map->buckets[b]->id; | ||
68 | } | ||
69 | } | ||
70 | } | ||
71 | |||
47 | void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) | 72 | void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) |
48 | { | 73 | { |
49 | kfree(b->h.perm); | 74 | kfree(b->h.perm); |
@@ -62,8 +87,6 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) | |||
62 | 87 | ||
63 | void crush_destroy_bucket_tree(struct crush_bucket_tree *b) | 88 | void crush_destroy_bucket_tree(struct crush_bucket_tree *b) |
64 | { | 89 | { |
65 | kfree(b->h.perm); | ||
66 | kfree(b->h.items); | ||
67 | kfree(b->node_weights); | 90 | kfree(b->node_weights); |
68 | kfree(b); | 91 | kfree(b); |
69 | } | 92 | } |
@@ -101,9 +124,10 @@ void crush_destroy_bucket(struct crush_bucket *b) | |||
101 | */ | 124 | */ |
102 | void crush_destroy(struct crush_map *map) | 125 | void crush_destroy(struct crush_map *map) |
103 | { | 126 | { |
127 | int b; | ||
128 | |||
104 | /* buckets */ | 129 | /* buckets */ |
105 | if (map->buckets) { | 130 | if (map->buckets) { |
106 | __s32 b; | ||
107 | for (b = 0; b < map->max_buckets; b++) { | 131 | for (b = 0; b < map->max_buckets; b++) { |
108 | if (map->buckets[b] == NULL) | 132 | if (map->buckets[b] == NULL) |
109 | continue; | 133 | continue; |
@@ -114,12 +138,13 @@ void crush_destroy(struct crush_map *map) | |||
114 | 138 | ||
115 | /* rules */ | 139 | /* rules */ |
116 | if (map->rules) { | 140 | if (map->rules) { |
117 | __u32 b; | ||
118 | for (b = 0; b < map->max_rules; b++) | 141 | for (b = 0; b < map->max_rules; b++) |
119 | kfree(map->rules[b]); | 142 | kfree(map->rules[b]); |
120 | kfree(map->rules); | 143 | kfree(map->rules); |
121 | } | 144 | } |
122 | 145 | ||
146 | kfree(map->bucket_parents); | ||
147 | kfree(map->device_parents); | ||
123 | kfree(map); | 148 | kfree(map); |
124 | } | 149 | } |
125 | 150 | ||
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 35fce755ce1..42599e31dca 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
@@ -20,7 +20,6 @@ | |||
20 | 20 | ||
21 | #include <linux/crush/crush.h> | 21 | #include <linux/crush/crush.h> |
22 | #include <linux/crush/hash.h> | 22 | #include <linux/crush/hash.h> |
23 | #include <linux/crush/mapper.h> | ||
24 | 23 | ||
25 | /* | 24 | /* |
26 | * Implement the core CRUSH mapping algorithm. | 25 | * Implement the core CRUSH mapping algorithm. |
@@ -33,9 +32,9 @@ | |||
33 | * @type: storage ruleset type (user defined) | 32 | * @type: storage ruleset type (user defined) |
34 | * @size: output set size | 33 | * @size: output set size |
35 | */ | 34 | */ |
36 | int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) | 35 | int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) |
37 | { | 36 | { |
38 | __u32 i; | 37 | int i; |
39 | 38 | ||
40 | for (i = 0; i < map->max_rules; i++) { | 39 | for (i = 0; i < map->max_rules; i++) { |
41 | if (map->rules[i] && | 40 | if (map->rules[i] && |
@@ -69,11 +68,11 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size | |||
69 | static int bucket_perm_choose(struct crush_bucket *bucket, | 68 | static int bucket_perm_choose(struct crush_bucket *bucket, |
70 | int x, int r) | 69 | int x, int r) |
71 | { | 70 | { |
72 | unsigned int pr = r % bucket->size; | 71 | unsigned pr = r % bucket->size; |
73 | unsigned int i, s; | 72 | unsigned i, s; |
74 | 73 | ||
75 | /* start a new permutation if @x has changed */ | 74 | /* start a new permutation if @x has changed */ |
76 | if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { | 75 | if (bucket->perm_x != x || bucket->perm_n == 0) { |
77 | dprintk("bucket %d new x=%d\n", bucket->id, x); | 76 | dprintk("bucket %d new x=%d\n", bucket->id, x); |
78 | bucket->perm_x = x; | 77 | bucket->perm_x = x; |
79 | 78 | ||
@@ -101,13 +100,13 @@ static int bucket_perm_choose(struct crush_bucket *bucket, | |||
101 | for (i = 0; i < bucket->perm_n; i++) | 100 | for (i = 0; i < bucket->perm_n; i++) |
102 | dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); | 101 | dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); |
103 | while (bucket->perm_n <= pr) { | 102 | while (bucket->perm_n <= pr) { |
104 | unsigned int p = bucket->perm_n; | 103 | unsigned p = bucket->perm_n; |
105 | /* no point in swapping the final entry */ | 104 | /* no point in swapping the final entry */ |
106 | if (p < bucket->size - 1) { | 105 | if (p < bucket->size - 1) { |
107 | i = crush_hash32_3(bucket->hash, x, bucket->id, p) % | 106 | i = crush_hash32_3(bucket->hash, x, bucket->id, p) % |
108 | (bucket->size - p); | 107 | (bucket->size - p); |
109 | if (i) { | 108 | if (i) { |
110 | unsigned int t = bucket->perm[p + i]; | 109 | unsigned t = bucket->perm[p + i]; |
111 | bucket->perm[p + i] = bucket->perm[p]; | 110 | bucket->perm[p + i] = bucket->perm[p]; |
112 | bucket->perm[p] = t; | 111 | bucket->perm[p] = t; |
113 | } | 112 | } |
@@ -153,8 +152,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, | |||
153 | return bucket->h.items[i]; | 152 | return bucket->h.items[i]; |
154 | } | 153 | } |
155 | 154 | ||
156 | dprintk("bad list sums for bucket %d\n", bucket->h.id); | 155 | BUG_ON(1); |
157 | return bucket->h.items[0]; | 156 | return 0; |
158 | } | 157 | } |
159 | 158 | ||
160 | 159 | ||
@@ -220,7 +219,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, | |||
220 | static int bucket_straw_choose(struct crush_bucket_straw *bucket, | 219 | static int bucket_straw_choose(struct crush_bucket_straw *bucket, |
221 | int x, int r) | 220 | int x, int r) |
222 | { | 221 | { |
223 | __u32 i; | 222 | int i; |
224 | int high = 0; | 223 | int high = 0; |
225 | __u64 high_draw = 0; | 224 | __u64 high_draw = 0; |
226 | __u64 draw; | 225 | __u64 draw; |
@@ -240,7 +239,6 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, | |||
240 | static int crush_bucket_choose(struct crush_bucket *in, int x, int r) | 239 | static int crush_bucket_choose(struct crush_bucket *in, int x, int r) |
241 | { | 240 | { |
242 | dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); | 241 | dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); |
243 | BUG_ON(in->size == 0); | ||
244 | switch (in->alg) { | 242 | switch (in->alg) { |
245 | case CRUSH_BUCKET_UNIFORM: | 243 | case CRUSH_BUCKET_UNIFORM: |
246 | return bucket_uniform_choose((struct crush_bucket_uniform *)in, | 244 | return bucket_uniform_choose((struct crush_bucket_uniform *)in, |
@@ -255,7 +253,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) | |||
255 | return bucket_straw_choose((struct crush_bucket_straw *)in, | 253 | return bucket_straw_choose((struct crush_bucket_straw *)in, |
256 | x, r); | 254 | x, r); |
257 | default: | 255 | default: |
258 | dprintk("unknown bucket %d alg %d\n", in->id, in->alg); | 256 | BUG_ON(1); |
259 | return in->items[0]; | 257 | return in->items[0]; |
260 | } | 258 | } |
261 | } | 259 | } |
@@ -264,7 +262,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) | |||
264 | * true if device is marked "out" (failed, fully offloaded) | 262 | * true if device is marked "out" (failed, fully offloaded) |
265 | * of the cluster | 263 | * of the cluster |
266 | */ | 264 | */ |
267 | static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) | 265 | static int is_out(struct crush_map *map, __u32 *weight, int item, int x) |
268 | { | 266 | { |
269 | if (weight[item] >= 0x10000) | 267 | if (weight[item] >= 0x10000) |
270 | return 0; | 268 | return 0; |
@@ -289,16 +287,16 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in | |||
289 | * @recurse_to_leaf: true if we want one device under each item of given type | 287 | * @recurse_to_leaf: true if we want one device under each item of given type |
290 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | 288 | * @out2: second output vector for leaf items (if @recurse_to_leaf) |
291 | */ | 289 | */ |
292 | static int crush_choose(const struct crush_map *map, | 290 | static int crush_choose(struct crush_map *map, |
293 | struct crush_bucket *bucket, | 291 | struct crush_bucket *bucket, |
294 | const __u32 *weight, | 292 | __u32 *weight, |
295 | int x, int numrep, int type, | 293 | int x, int numrep, int type, |
296 | int *out, int outpos, | 294 | int *out, int outpos, |
297 | int firstn, int recurse_to_leaf, | 295 | int firstn, int recurse_to_leaf, |
298 | int *out2) | 296 | int *out2) |
299 | { | 297 | { |
300 | int rep; | 298 | int rep; |
301 | unsigned int ftotal, flocal; | 299 | int ftotal, flocal; |
302 | int retry_descent, retry_bucket, skip_rep; | 300 | int retry_descent, retry_bucket, skip_rep; |
303 | struct crush_bucket *in = bucket; | 301 | struct crush_bucket *in = bucket; |
304 | int r; | 302 | int r; |
@@ -306,6 +304,7 @@ static int crush_choose(const struct crush_map *map, | |||
306 | int item = 0; | 304 | int item = 0; |
307 | int itemtype; | 305 | int itemtype; |
308 | int collide, reject; | 306 | int collide, reject; |
307 | const int orig_tries = 5; /* attempts before we fall back to search */ | ||
309 | 308 | ||
310 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", | 309 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", |
311 | bucket->id, x, outpos, numrep); | 310 | bucket->id, x, outpos, numrep); |
@@ -326,7 +325,7 @@ static int crush_choose(const struct crush_map *map, | |||
326 | r = rep; | 325 | r = rep; |
327 | if (in->alg == CRUSH_BUCKET_UNIFORM) { | 326 | if (in->alg == CRUSH_BUCKET_UNIFORM) { |
328 | /* be careful */ | 327 | /* be careful */ |
329 | if (firstn || (__u32)numrep >= in->size) | 328 | if (firstn || numrep >= in->size) |
330 | /* r' = r + f_total */ | 329 | /* r' = r + f_total */ |
331 | r += ftotal; | 330 | r += ftotal; |
332 | else if (in->size % numrep == 0) | 331 | else if (in->size % numrep == 0) |
@@ -350,17 +349,12 @@ static int crush_choose(const struct crush_map *map, | |||
350 | reject = 1; | 349 | reject = 1; |
351 | goto reject; | 350 | goto reject; |
352 | } | 351 | } |
353 | if (map->choose_local_fallback_tries > 0 && | 352 | if (flocal >= (in->size>>1) && |
354 | flocal >= (in->size>>1) && | 353 | flocal > orig_tries) |
355 | flocal > map->choose_local_fallback_tries) | ||
356 | item = bucket_perm_choose(in, x, r); | 354 | item = bucket_perm_choose(in, x, r); |
357 | else | 355 | else |
358 | item = crush_bucket_choose(in, x, r); | 356 | item = crush_bucket_choose(in, x, r); |
359 | if (item >= map->max_devices) { | 357 | BUG_ON(item >= map->max_devices); |
360 | dprintk(" bad item %d\n", item); | ||
361 | skip_rep = 1; | ||
362 | break; | ||
363 | } | ||
364 | 358 | ||
365 | /* desired type? */ | 359 | /* desired type? */ |
366 | if (item < 0) | 360 | if (item < 0) |
@@ -371,12 +365,8 @@ static int crush_choose(const struct crush_map *map, | |||
371 | 365 | ||
372 | /* keep going? */ | 366 | /* keep going? */ |
373 | if (itemtype != type) { | 367 | if (itemtype != type) { |
374 | if (item >= 0 || | 368 | BUG_ON(item >= 0 || |
375 | (-1-item) >= map->max_buckets) { | 369 | (-1-item) >= map->max_buckets); |
376 | dprintk(" bad item type %d\n", type); | ||
377 | skip_rep = 1; | ||
378 | break; | ||
379 | } | ||
380 | in = map->buckets[-1-item]; | 370 | in = map->buckets[-1-item]; |
381 | retry_bucket = 1; | 371 | retry_bucket = 1; |
382 | continue; | 372 | continue; |
@@ -422,21 +412,20 @@ reject: | |||
422 | ftotal++; | 412 | ftotal++; |
423 | flocal++; | 413 | flocal++; |
424 | 414 | ||
425 | if (collide && flocal <= map->choose_local_tries) | 415 | if (collide && flocal < 3) |
426 | /* retry locally a few times */ | 416 | /* retry locally a few times */ |
427 | retry_bucket = 1; | 417 | retry_bucket = 1; |
428 | else if (map->choose_local_fallback_tries > 0 && | 418 | else if (flocal < in->size + orig_tries) |
429 | flocal <= in->size + map->choose_local_fallback_tries) | ||
430 | /* exhaustive bucket search */ | 419 | /* exhaustive bucket search */ |
431 | retry_bucket = 1; | 420 | retry_bucket = 1; |
432 | else if (ftotal <= map->choose_total_tries) | 421 | else if (ftotal < 20) |
433 | /* then retry descent */ | 422 | /* then retry descent */ |
434 | retry_descent = 1; | 423 | retry_descent = 1; |
435 | else | 424 | else |
436 | /* else give up */ | 425 | /* else give up */ |
437 | skip_rep = 1; | 426 | skip_rep = 1; |
438 | dprintk(" reject %d collide %d " | 427 | dprintk(" reject %d collide %d " |
439 | "ftotal %u flocal %u\n", | 428 | "ftotal %d flocal %d\n", |
440 | reject, collide, ftotal, | 429 | reject, collide, ftotal, |
441 | flocal); | 430 | flocal); |
442 | } | 431 | } |
@@ -465,12 +454,15 @@ reject: | |||
465 | * @x: hash input | 454 | * @x: hash input |
466 | * @result: pointer to result vector | 455 | * @result: pointer to result vector |
467 | * @result_max: maximum result size | 456 | * @result_max: maximum result size |
457 | * @force: force initial replica choice; -1 for none | ||
468 | */ | 458 | */ |
469 | int crush_do_rule(const struct crush_map *map, | 459 | int crush_do_rule(struct crush_map *map, |
470 | int ruleno, int x, int *result, int result_max, | 460 | int ruleno, int x, int *result, int result_max, |
471 | const __u32 *weight) | 461 | int force, __u32 *weight) |
472 | { | 462 | { |
473 | int result_len; | 463 | int result_len; |
464 | int force_context[CRUSH_MAX_DEPTH]; | ||
465 | int force_pos = -1; | ||
474 | int a[CRUSH_MAX_SET]; | 466 | int a[CRUSH_MAX_SET]; |
475 | int b[CRUSH_MAX_SET]; | 467 | int b[CRUSH_MAX_SET]; |
476 | int c[CRUSH_MAX_SET]; | 468 | int c[CRUSH_MAX_SET]; |
@@ -481,44 +473,67 @@ int crush_do_rule(const struct crush_map *map, | |||
481 | int osize; | 473 | int osize; |
482 | int *tmp; | 474 | int *tmp; |
483 | struct crush_rule *rule; | 475 | struct crush_rule *rule; |
484 | __u32 step; | 476 | int step; |
485 | int i, j; | 477 | int i, j; |
486 | int numrep; | 478 | int numrep; |
487 | int firstn; | 479 | int firstn; |
480 | int rc = -1; | ||
488 | 481 | ||
489 | if ((__u32)ruleno >= map->max_rules) { | 482 | BUG_ON(ruleno >= map->max_rules); |
490 | dprintk(" bad ruleno %d\n", ruleno); | ||
491 | return 0; | ||
492 | } | ||
493 | 483 | ||
494 | rule = map->rules[ruleno]; | 484 | rule = map->rules[ruleno]; |
495 | result_len = 0; | 485 | result_len = 0; |
496 | w = a; | 486 | w = a; |
497 | o = b; | 487 | o = b; |
498 | 488 | ||
499 | for (step = 0; step < rule->len; step++) { | 489 | /* |
500 | struct crush_rule_step *curstep = &rule->steps[step]; | 490 | * determine hierarchical context of force, if any. note |
491 | * that this may or may not correspond to the specific types | ||
492 | * referenced by the crush rule. | ||
493 | */ | ||
494 | if (force >= 0) { | ||
495 | if (force >= map->max_devices || | ||
496 | map->device_parents[force] == 0) { | ||
497 | /*dprintk("CRUSH: forcefed device dne\n");*/ | ||
498 | rc = -1; /* force fed device dne */ | ||
499 | goto out; | ||
500 | } | ||
501 | if (!is_out(map, weight, force, x)) { | ||
502 | while (1) { | ||
503 | force_context[++force_pos] = force; | ||
504 | if (force >= 0) | ||
505 | force = map->device_parents[force]; | ||
506 | else | ||
507 | force = map->bucket_parents[-1-force]; | ||
508 | if (force == 0) | ||
509 | break; | ||
510 | } | ||
511 | } | ||
512 | } | ||
501 | 513 | ||
514 | for (step = 0; step < rule->len; step++) { | ||
502 | firstn = 0; | 515 | firstn = 0; |
503 | switch (curstep->op) { | 516 | switch (rule->steps[step].op) { |
504 | case CRUSH_RULE_TAKE: | 517 | case CRUSH_RULE_TAKE: |
505 | w[0] = curstep->arg1; | 518 | w[0] = rule->steps[step].arg1; |
519 | if (force_pos >= 0) { | ||
520 | BUG_ON(force_context[force_pos] != w[0]); | ||
521 | force_pos--; | ||
522 | } | ||
506 | wsize = 1; | 523 | wsize = 1; |
507 | break; | 524 | break; |
508 | 525 | ||
509 | case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: | 526 | case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: |
510 | case CRUSH_RULE_CHOOSE_FIRSTN: | 527 | case CRUSH_RULE_CHOOSE_FIRSTN: |
511 | firstn = 1; | 528 | firstn = 1; |
512 | /* fall through */ | ||
513 | case CRUSH_RULE_CHOOSE_LEAF_INDEP: | 529 | case CRUSH_RULE_CHOOSE_LEAF_INDEP: |
514 | case CRUSH_RULE_CHOOSE_INDEP: | 530 | case CRUSH_RULE_CHOOSE_INDEP: |
515 | if (wsize == 0) | 531 | BUG_ON(wsize == 0); |
516 | break; | ||
517 | 532 | ||
518 | recurse_to_leaf = | 533 | recurse_to_leaf = |
519 | curstep->op == | 534 | rule->steps[step].op == |
520 | CRUSH_RULE_CHOOSE_LEAF_FIRSTN || | 535 | CRUSH_RULE_CHOOSE_LEAF_FIRSTN || |
521 | curstep->op == | 536 | rule->steps[step].op == |
522 | CRUSH_RULE_CHOOSE_LEAF_INDEP; | 537 | CRUSH_RULE_CHOOSE_LEAF_INDEP; |
523 | 538 | ||
524 | /* reset output */ | 539 | /* reset output */ |
@@ -530,18 +545,32 @@ int crush_do_rule(const struct crush_map *map, | |||
530 | * basically, numrep <= 0 means relative to | 545 | * basically, numrep <= 0 means relative to |
531 | * the provided result_max | 546 | * the provided result_max |
532 | */ | 547 | */ |
533 | numrep = curstep->arg1; | 548 | numrep = rule->steps[step].arg1; |
534 | if (numrep <= 0) { | 549 | if (numrep <= 0) { |
535 | numrep += result_max; | 550 | numrep += result_max; |
536 | if (numrep <= 0) | 551 | if (numrep <= 0) |
537 | continue; | 552 | continue; |
538 | } | 553 | } |
539 | j = 0; | 554 | j = 0; |
555 | if (osize == 0 && force_pos >= 0) { | ||
556 | /* skip any intermediate types */ | ||
557 | while (force_pos && | ||
558 | force_context[force_pos] < 0 && | ||
559 | rule->steps[step].arg2 != | ||
560 | map->buckets[-1 - | ||
561 | force_context[force_pos]]->type) | ||
562 | force_pos--; | ||
563 | o[osize] = force_context[force_pos]; | ||
564 | if (recurse_to_leaf) | ||
565 | c[osize] = force_context[0]; | ||
566 | j++; | ||
567 | force_pos--; | ||
568 | } | ||
540 | osize += crush_choose(map, | 569 | osize += crush_choose(map, |
541 | map->buckets[-1-w[i]], | 570 | map->buckets[-1-w[i]], |
542 | weight, | 571 | weight, |
543 | x, numrep, | 572 | x, numrep, |
544 | curstep->arg2, | 573 | rule->steps[step].arg2, |
545 | o+osize, j, | 574 | o+osize, j, |
546 | firstn, | 575 | firstn, |
547 | recurse_to_leaf, c+osize); | 576 | recurse_to_leaf, c+osize); |
@@ -568,12 +597,13 @@ int crush_do_rule(const struct crush_map *map, | |||
568 | break; | 597 | break; |
569 | 598 | ||
570 | default: | 599 | default: |
571 | dprintk(" unknown op %d at step %d\n", | 600 | BUG_ON(1); |
572 | curstep->op, step); | ||
573 | break; | ||
574 | } | 601 | } |
575 | } | 602 | } |
576 | return result_len; | 603 | rc = result_len; |
604 | |||
605 | out: | ||
606 | return rc; | ||
577 | } | 607 | } |
578 | 608 | ||
579 | 609 | ||
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index af14cb42516..85f3bc0a706 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c | |||
@@ -15,9 +15,10 @@ int ceph_crypto_key_clone(struct ceph_crypto_key *dst, | |||
15 | const struct ceph_crypto_key *src) | 15 | const struct ceph_crypto_key *src) |
16 | { | 16 | { |
17 | memcpy(dst, src, sizeof(struct ceph_crypto_key)); | 17 | memcpy(dst, src, sizeof(struct ceph_crypto_key)); |
18 | dst->key = kmemdup(src->key, src->len, GFP_NOFS); | 18 | dst->key = kmalloc(src->len, GFP_NOFS); |
19 | if (!dst->key) | 19 | if (!dst->key) |
20 | return -ENOMEM; | 20 | return -ENOMEM; |
21 | memcpy(dst->key, src->key, src->len); | ||
21 | return 0; | 22 | return 0; |
22 | } | 23 | } |
23 | 24 | ||
@@ -423,15 +424,14 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, | |||
423 | } | 424 | } |
424 | } | 425 | } |
425 | 426 | ||
426 | int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) | 427 | int ceph_key_instantiate(struct key *key, const void *data, size_t datalen) |
427 | { | 428 | { |
428 | struct ceph_crypto_key *ckey; | 429 | struct ceph_crypto_key *ckey; |
429 | size_t datalen = prep->datalen; | ||
430 | int ret; | 430 | int ret; |
431 | void *p; | 431 | void *p; |
432 | 432 | ||
433 | ret = -EINVAL; | 433 | ret = -EINVAL; |
434 | if (datalen <= 0 || datalen > 32767 || !prep->data) | 434 | if (datalen <= 0 || datalen > 32767 || !data) |
435 | goto err; | 435 | goto err; |
436 | 436 | ||
437 | ret = key_payload_reserve(key, datalen); | 437 | ret = key_payload_reserve(key, datalen); |
@@ -444,8 +444,8 @@ int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) | |||
444 | goto err; | 444 | goto err; |
445 | 445 | ||
446 | /* TODO ceph_crypto_key_decode should really take const input */ | 446 | /* TODO ceph_crypto_key_decode should really take const input */ |
447 | p = (void *)prep->data; | 447 | p = (void *)data; |
448 | ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); | 448 | ret = ceph_crypto_key_decode(ckey, &p, (char*)data+datalen); |
449 | if (ret < 0) | 449 | if (ret < 0) |
450 | goto err_ckey; | 450 | goto err_ckey; |
451 | 451 | ||
@@ -467,7 +467,6 @@ void ceph_key_destroy(struct key *key) { | |||
467 | struct ceph_crypto_key *ckey = key->payload.data; | 467 | struct ceph_crypto_key *ckey = key->payload.data; |
468 | 468 | ||
469 | ceph_crypto_key_destroy(ckey); | 469 | ceph_crypto_key_destroy(ckey); |
470 | kfree(ckey); | ||
471 | } | 470 | } |
472 | 471 | ||
473 | struct key_type key_type_ceph = { | 472 | struct key_type key_type_ceph = { |
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 3572dc518bc..1919d1550d7 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h | |||
@@ -16,8 +16,7 @@ struct ceph_crypto_key { | |||
16 | 16 | ||
17 | static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) | 17 | static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) |
18 | { | 18 | { |
19 | if (key) | 19 | kfree(key->key); |
20 | kfree(key->key); | ||
21 | } | 20 | } |
22 | 21 | ||
23 | extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst, | 22 | extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst, |
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 38b5dc1823d..27d4ea315d1 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c | |||
@@ -94,9 +94,9 @@ static int monc_show(struct seq_file *s, void *p) | |||
94 | mutex_lock(&monc->mutex); | 94 | mutex_lock(&monc->mutex); |
95 | 95 | ||
96 | if (monc->have_mdsmap) | 96 | if (monc->have_mdsmap) |
97 | seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); | 97 | seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); |
98 | if (monc->have_osdmap) | 98 | if (monc->have_osdmap) |
99 | seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); | 99 | seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); |
100 | if (monc->want_next_osdmap) | 100 | if (monc->want_next_osdmap) |
101 | seq_printf(s, "want next osdmap\n"); | 101 | seq_printf(s, "want next osdmap\n"); |
102 | 102 | ||
@@ -146,7 +146,7 @@ static int osdc_show(struct seq_file *s, void *pp) | |||
146 | 146 | ||
147 | if (req->r_reassert_version.epoch) | 147 | if (req->r_reassert_version.epoch) |
148 | seq_printf(s, "\t%u'%llu", | 148 | seq_printf(s, "\t%u'%llu", |
149 | (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), | 149 | (unsigned)le32_to_cpu(req->r_reassert_version.epoch), |
150 | le64_to_cpu(req->r_reassert_version.version)); | 150 | le64_to_cpu(req->r_reassert_version.version)); |
151 | else | 151 | else |
152 | seq_printf(s, "\t"); | 152 | seq_printf(s, "\t"); |
@@ -189,9 +189,6 @@ int ceph_debugfs_client_init(struct ceph_client *client) | |||
189 | snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, | 189 | snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, |
190 | client->monc.auth->global_id); | 190 | client->monc.auth->global_id); |
191 | 191 | ||
192 | dout("ceph_debugfs_client_init %p %s\n", client, name); | ||
193 | |||
194 | BUG_ON(client->debugfs_dir); | ||
195 | client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); | 192 | client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); |
196 | if (!client->debugfs_dir) | 193 | if (!client->debugfs_dir) |
197 | goto out; | 194 | goto out; |
@@ -237,7 +234,6 @@ out: | |||
237 | 234 | ||
238 | void ceph_debugfs_client_cleanup(struct ceph_client *client) | 235 | void ceph_debugfs_client_cleanup(struct ceph_client *client) |
239 | { | 236 | { |
240 | dout("ceph_debugfs_client_cleanup %p\n", client); | ||
241 | debugfs_remove(client->debugfs_osdmap); | 237 | debugfs_remove(client->debugfs_osdmap); |
242 | debugfs_remove(client->debugfs_monmap); | 238 | debugfs_remove(client->debugfs_monmap); |
243 | debugfs_remove(client->osdc.debugfs_file); | 239 | debugfs_remove(client->osdc.debugfs_file); |
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5ccf87ed8d6..9918e9eb276 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c | |||
@@ -11,14 +11,12 @@ | |||
11 | #include <linux/string.h> | 11 | #include <linux/string.h> |
12 | #include <linux/bio.h> | 12 | #include <linux/bio.h> |
13 | #include <linux/blkdev.h> | 13 | #include <linux/blkdev.h> |
14 | #include <linux/dns_resolver.h> | ||
15 | #include <net/tcp.h> | 14 | #include <net/tcp.h> |
16 | 15 | ||
17 | #include <linux/ceph/libceph.h> | 16 | #include <linux/ceph/libceph.h> |
18 | #include <linux/ceph/messenger.h> | 17 | #include <linux/ceph/messenger.h> |
19 | #include <linux/ceph/decode.h> | 18 | #include <linux/ceph/decode.h> |
20 | #include <linux/ceph/pagelist.h> | 19 | #include <linux/ceph/pagelist.h> |
21 | #include <linux/export.h> | ||
22 | 20 | ||
23 | /* | 21 | /* |
24 | * Ceph uses the messenger to exchange ceph_msg messages with other | 22 | * Ceph uses the messenger to exchange ceph_msg messages with other |
@@ -29,74 +27,6 @@ | |||
29 | * the sender. | 27 | * the sender. |
30 | */ | 28 | */ |
31 | 29 | ||
32 | /* | ||
33 | * We track the state of the socket on a given connection using | ||
34 | * values defined below. The transition to a new socket state is | ||
35 | * handled by a function which verifies we aren't coming from an | ||
36 | * unexpected state. | ||
37 | * | ||
38 | * -------- | ||
39 | * | NEW* | transient initial state | ||
40 | * -------- | ||
41 | * | con_sock_state_init() | ||
42 | * v | ||
43 | * ---------- | ||
44 | * | CLOSED | initialized, but no socket (and no | ||
45 | * ---------- TCP connection) | ||
46 | * ^ \ | ||
47 | * | \ con_sock_state_connecting() | ||
48 | * | ---------------------- | ||
49 | * | \ | ||
50 | * + con_sock_state_closed() \ | ||
51 | * |+--------------------------- \ | ||
52 | * | \ \ \ | ||
53 | * | ----------- \ \ | ||
54 | * | | CLOSING | socket event; \ \ | ||
55 | * | ----------- await close \ \ | ||
56 | * | ^ \ | | ||
57 | * | | \ | | ||
58 | * | + con_sock_state_closing() \ | | ||
59 | * | / \ | | | ||
60 | * | / --------------- | | | ||
61 | * | / \ v v | ||
62 | * | / -------------- | ||
63 | * | / -----------------| CONNECTING | socket created, TCP | ||
64 | * | | / -------------- connect initiated | ||
65 | * | | | con_sock_state_connected() | ||
66 | * | | v | ||
67 | * ------------- | ||
68 | * | CONNECTED | TCP connection established | ||
69 | * ------------- | ||
70 | * | ||
71 | * State values for ceph_connection->sock_state; NEW is assumed to be 0. | ||
72 | */ | ||
73 | |||
74 | #define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ | ||
75 | #define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ | ||
76 | #define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ | ||
77 | #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ | ||
78 | #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ | ||
79 | |||
80 | /* | ||
81 | * connection states | ||
82 | */ | ||
83 | #define CON_STATE_CLOSED 1 /* -> PREOPEN */ | ||
84 | #define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ | ||
85 | #define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ | ||
86 | #define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ | ||
87 | #define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ | ||
88 | #define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ | ||
89 | |||
90 | /* | ||
91 | * ceph_connection flag bits | ||
92 | */ | ||
93 | #define CON_FLAG_LOSSYTX 0 /* we can close channel or drop | ||
94 | * messages on errors */ | ||
95 | #define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ | ||
96 | #define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ | ||
97 | #define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ | ||
98 | #define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ | ||
99 | |||
100 | /* static tag bytes (protocol control messages) */ | 30 | /* static tag bytes (protocol control messages) */ |
101 | static char tag_msg = CEPH_MSGR_TAG_MSG; | 31 | static char tag_msg = CEPH_MSGR_TAG_MSG; |
102 | static char tag_ack = CEPH_MSGR_TAG_ACK; | 32 | static char tag_ack = CEPH_MSGR_TAG_ACK; |
@@ -106,54 +36,48 @@ static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; | |||
106 | static struct lock_class_key socket_class; | 36 | static struct lock_class_key socket_class; |
107 | #endif | 37 | #endif |
108 | 38 | ||
109 | /* | ||
110 | * When skipping (ignoring) a block of input we read it into a "skip | ||
111 | * buffer," which is this many bytes in size. | ||
112 | */ | ||
113 | #define SKIP_BUF_SIZE 1024 | ||
114 | 39 | ||
115 | static void queue_con(struct ceph_connection *con); | 40 | static void queue_con(struct ceph_connection *con); |
116 | static void con_work(struct work_struct *); | 41 | static void con_work(struct work_struct *); |
117 | static void ceph_fault(struct ceph_connection *con); | 42 | static void ceph_fault(struct ceph_connection *con); |
118 | 43 | ||
119 | /* | 44 | /* |
120 | * Nicely render a sockaddr as a string. An array of formatted | 45 | * nicely render a sockaddr as a string. |
121 | * strings is used, to approximate reentrancy. | ||
122 | */ | 46 | */ |
123 | #define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ | 47 | #define MAX_ADDR_STR 20 |
124 | #define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) | 48 | #define MAX_ADDR_STR_LEN 60 |
125 | #define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) | 49 | static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; |
126 | #define MAX_ADDR_STR_LEN 64 /* 54 is enough */ | 50 | static DEFINE_SPINLOCK(addr_str_lock); |
127 | 51 | static int last_addr_str; | |
128 | static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; | ||
129 | static atomic_t addr_str_seq = ATOMIC_INIT(0); | ||
130 | |||
131 | static struct page *zero_page; /* used in certain error cases */ | ||
132 | 52 | ||
133 | const char *ceph_pr_addr(const struct sockaddr_storage *ss) | 53 | const char *ceph_pr_addr(const struct sockaddr_storage *ss) |
134 | { | 54 | { |
135 | int i; | 55 | int i; |
136 | char *s; | 56 | char *s; |
137 | struct sockaddr_in *in4 = (struct sockaddr_in *) ss; | 57 | struct sockaddr_in *in4 = (void *)ss; |
138 | struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; | 58 | struct sockaddr_in6 *in6 = (void *)ss; |
139 | 59 | ||
140 | i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; | 60 | spin_lock(&addr_str_lock); |
61 | i = last_addr_str++; | ||
62 | if (last_addr_str == MAX_ADDR_STR) | ||
63 | last_addr_str = 0; | ||
64 | spin_unlock(&addr_str_lock); | ||
141 | s = addr_str[i]; | 65 | s = addr_str[i]; |
142 | 66 | ||
143 | switch (ss->ss_family) { | 67 | switch (ss->ss_family) { |
144 | case AF_INET: | 68 | case AF_INET: |
145 | snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, | 69 | snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, |
146 | ntohs(in4->sin_port)); | 70 | (unsigned int)ntohs(in4->sin_port)); |
147 | break; | 71 | break; |
148 | 72 | ||
149 | case AF_INET6: | 73 | case AF_INET6: |
150 | snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr, | 74 | snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, |
151 | ntohs(in6->sin6_port)); | 75 | (unsigned int)ntohs(in6->sin6_port)); |
152 | break; | 76 | break; |
153 | 77 | ||
154 | default: | 78 | default: |
155 | snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", | 79 | snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", |
156 | ss->ss_family); | 80 | (int)ss->ss_family); |
157 | } | 81 | } |
158 | 82 | ||
159 | return s; | 83 | return s; |
@@ -169,43 +93,22 @@ static void encode_my_addr(struct ceph_messenger *msgr) | |||
169 | /* | 93 | /* |
170 | * work queue for all reading and writing to/from the socket. | 94 | * work queue for all reading and writing to/from the socket. |
171 | */ | 95 | */ |
172 | static struct workqueue_struct *ceph_msgr_wq; | 96 | struct workqueue_struct *ceph_msgr_wq; |
173 | |||
174 | void _ceph_msgr_exit(void) | ||
175 | { | ||
176 | if (ceph_msgr_wq) { | ||
177 | destroy_workqueue(ceph_msgr_wq); | ||
178 | ceph_msgr_wq = NULL; | ||
179 | } | ||
180 | |||
181 | BUG_ON(zero_page == NULL); | ||
182 | kunmap(zero_page); | ||
183 | page_cache_release(zero_page); | ||
184 | zero_page = NULL; | ||
185 | } | ||
186 | 97 | ||
187 | int ceph_msgr_init(void) | 98 | int ceph_msgr_init(void) |
188 | { | 99 | { |
189 | BUG_ON(zero_page != NULL); | ||
190 | zero_page = ZERO_PAGE(0); | ||
191 | page_cache_get(zero_page); | ||
192 | |||
193 | ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); | 100 | ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); |
194 | if (ceph_msgr_wq) | 101 | if (!ceph_msgr_wq) { |
195 | return 0; | 102 | pr_err("msgr_init failed to create workqueue\n"); |
196 | 103 | return -ENOMEM; | |
197 | pr_err("msgr_init failed to create workqueue\n"); | 104 | } |
198 | _ceph_msgr_exit(); | 105 | return 0; |
199 | |||
200 | return -ENOMEM; | ||
201 | } | 106 | } |
202 | EXPORT_SYMBOL(ceph_msgr_init); | 107 | EXPORT_SYMBOL(ceph_msgr_init); |
203 | 108 | ||
204 | void ceph_msgr_exit(void) | 109 | void ceph_msgr_exit(void) |
205 | { | 110 | { |
206 | BUG_ON(ceph_msgr_wq == NULL); | 111 | destroy_workqueue(ceph_msgr_wq); |
207 | |||
208 | _ceph_msgr_exit(); | ||
209 | } | 112 | } |
210 | EXPORT_SYMBOL(ceph_msgr_exit); | 113 | EXPORT_SYMBOL(ceph_msgr_exit); |
211 | 114 | ||
@@ -215,134 +118,70 @@ void ceph_msgr_flush(void) | |||
215 | } | 118 | } |
216 | EXPORT_SYMBOL(ceph_msgr_flush); | 119 | EXPORT_SYMBOL(ceph_msgr_flush); |
217 | 120 | ||
218 | /* Connection socket state transition functions */ | ||
219 | |||
220 | static void con_sock_state_init(struct ceph_connection *con) | ||
221 | { | ||
222 | int old_state; | ||
223 | |||
224 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); | ||
225 | if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) | ||
226 | printk("%s: unexpected old state %d\n", __func__, old_state); | ||
227 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, | ||
228 | CON_SOCK_STATE_CLOSED); | ||
229 | } | ||
230 | |||
231 | static void con_sock_state_connecting(struct ceph_connection *con) | ||
232 | { | ||
233 | int old_state; | ||
234 | |||
235 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); | ||
236 | if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) | ||
237 | printk("%s: unexpected old state %d\n", __func__, old_state); | ||
238 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, | ||
239 | CON_SOCK_STATE_CONNECTING); | ||
240 | } | ||
241 | |||
242 | static void con_sock_state_connected(struct ceph_connection *con) | ||
243 | { | ||
244 | int old_state; | ||
245 | |||
246 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); | ||
247 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) | ||
248 | printk("%s: unexpected old state %d\n", __func__, old_state); | ||
249 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, | ||
250 | CON_SOCK_STATE_CONNECTED); | ||
251 | } | ||
252 | |||
253 | static void con_sock_state_closing(struct ceph_connection *con) | ||
254 | { | ||
255 | int old_state; | ||
256 | |||
257 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); | ||
258 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && | ||
259 | old_state != CON_SOCK_STATE_CONNECTED && | ||
260 | old_state != CON_SOCK_STATE_CLOSING)) | ||
261 | printk("%s: unexpected old state %d\n", __func__, old_state); | ||
262 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, | ||
263 | CON_SOCK_STATE_CLOSING); | ||
264 | } | ||
265 | |||
266 | static void con_sock_state_closed(struct ceph_connection *con) | ||
267 | { | ||
268 | int old_state; | ||
269 | |||
270 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); | ||
271 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && | ||
272 | old_state != CON_SOCK_STATE_CLOSING && | ||
273 | old_state != CON_SOCK_STATE_CONNECTING && | ||
274 | old_state != CON_SOCK_STATE_CLOSED)) | ||
275 | printk("%s: unexpected old state %d\n", __func__, old_state); | ||
276 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, | ||
277 | CON_SOCK_STATE_CLOSED); | ||
278 | } | ||
279 | 121 | ||
280 | /* | 122 | /* |
281 | * socket callback functions | 123 | * socket callback functions |
282 | */ | 124 | */ |
283 | 125 | ||
284 | /* data available on socket, or listen socket received a connect */ | 126 | /* data available on socket, or listen socket received a connect */ |
285 | static void ceph_sock_data_ready(struct sock *sk, int count_unused) | 127 | static void ceph_data_ready(struct sock *sk, int count_unused) |
286 | { | 128 | { |
287 | struct ceph_connection *con = sk->sk_user_data; | 129 | struct ceph_connection *con = |
288 | if (atomic_read(&con->msgr->stopping)) { | 130 | (struct ceph_connection *)sk->sk_user_data; |
289 | return; | ||
290 | } | ||
291 | |||
292 | if (sk->sk_state != TCP_CLOSE_WAIT) { | 131 | if (sk->sk_state != TCP_CLOSE_WAIT) { |
293 | dout("%s on %p state = %lu, queueing work\n", __func__, | 132 | dout("ceph_data_ready on %p state = %lu, queueing work\n", |
294 | con, con->state); | 133 | con, con->state); |
295 | queue_con(con); | 134 | queue_con(con); |
296 | } | 135 | } |
297 | } | 136 | } |
298 | 137 | ||
299 | /* socket has buffer space for writing */ | 138 | /* socket has buffer space for writing */ |
300 | static void ceph_sock_write_space(struct sock *sk) | 139 | static void ceph_write_space(struct sock *sk) |
301 | { | 140 | { |
302 | struct ceph_connection *con = sk->sk_user_data; | 141 | struct ceph_connection *con = |
142 | (struct ceph_connection *)sk->sk_user_data; | ||
303 | 143 | ||
304 | /* only queue to workqueue if there is data we want to write, | 144 | /* only queue to workqueue if there is data we want to write. */ |
305 | * and there is sufficient space in the socket buffer to accept | 145 | if (test_bit(WRITE_PENDING, &con->state)) { |
306 | * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() | 146 | dout("ceph_write_space %p queueing write work\n", con); |
307 | * doesn't get called again until try_write() fills the socket | 147 | queue_con(con); |
308 | * buffer. See net/ipv4/tcp_input.c:tcp_check_space() | ||
309 | * and net/core/stream.c:sk_stream_write_space(). | ||
310 | */ | ||
311 | if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { | ||
312 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { | ||
313 | dout("%s %p queueing write work\n", __func__, con); | ||
314 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
315 | queue_con(con); | ||
316 | } | ||
317 | } else { | 148 | } else { |
318 | dout("%s %p nothing to write\n", __func__, con); | 149 | dout("ceph_write_space %p nothing to write\n", con); |
319 | } | 150 | } |
151 | |||
152 | /* since we have our own write_space, clear the SOCK_NOSPACE flag */ | ||
153 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
320 | } | 154 | } |
321 | 155 | ||
322 | /* socket's state has changed */ | 156 | /* socket's state has changed */ |
323 | static void ceph_sock_state_change(struct sock *sk) | 157 | static void ceph_state_change(struct sock *sk) |
324 | { | 158 | { |
325 | struct ceph_connection *con = sk->sk_user_data; | 159 | struct ceph_connection *con = |
160 | (struct ceph_connection *)sk->sk_user_data; | ||
326 | 161 | ||
327 | dout("%s %p state = %lu sk_state = %u\n", __func__, | 162 | dout("ceph_state_change %p state = %lu sk_state = %u\n", |
328 | con, con->state, sk->sk_state); | 163 | con, con->state, sk->sk_state); |
329 | 164 | ||
165 | if (test_bit(CLOSED, &con->state)) | ||
166 | return; | ||
167 | |||
330 | switch (sk->sk_state) { | 168 | switch (sk->sk_state) { |
331 | case TCP_CLOSE: | 169 | case TCP_CLOSE: |
332 | dout("%s TCP_CLOSE\n", __func__); | 170 | dout("ceph_state_change TCP_CLOSE\n"); |
333 | case TCP_CLOSE_WAIT: | 171 | case TCP_CLOSE_WAIT: |
334 | dout("%s TCP_CLOSE_WAIT\n", __func__); | 172 | dout("ceph_state_change TCP_CLOSE_WAIT\n"); |
335 | con_sock_state_closing(con); | 173 | if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { |
336 | set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); | 174 | if (test_bit(CONNECTING, &con->state)) |
337 | queue_con(con); | 175 | con->error_msg = "connection failed"; |
176 | else | ||
177 | con->error_msg = "socket closed"; | ||
178 | queue_con(con); | ||
179 | } | ||
338 | break; | 180 | break; |
339 | case TCP_ESTABLISHED: | 181 | case TCP_ESTABLISHED: |
340 | dout("%s TCP_ESTABLISHED\n", __func__); | 182 | dout("ceph_state_change TCP_ESTABLISHED\n"); |
341 | con_sock_state_connected(con); | ||
342 | queue_con(con); | 183 | queue_con(con); |
343 | break; | 184 | break; |
344 | default: /* Everything else is uninteresting */ | ||
345 | break; | ||
346 | } | 185 | } |
347 | } | 186 | } |
348 | 187 | ||
@@ -353,10 +192,10 @@ static void set_sock_callbacks(struct socket *sock, | |||
353 | struct ceph_connection *con) | 192 | struct ceph_connection *con) |
354 | { | 193 | { |
355 | struct sock *sk = sock->sk; | 194 | struct sock *sk = sock->sk; |
356 | sk->sk_user_data = con; | 195 | sk->sk_user_data = (void *)con; |
357 | sk->sk_data_ready = ceph_sock_data_ready; | 196 | sk->sk_data_ready = ceph_data_ready; |
358 | sk->sk_write_space = ceph_sock_write_space; | 197 | sk->sk_write_space = ceph_write_space; |
359 | sk->sk_state_change = ceph_sock_state_change; | 198 | sk->sk_state_change = ceph_state_change; |
360 | } | 199 | } |
361 | 200 | ||
362 | 201 | ||
@@ -367,7 +206,7 @@ static void set_sock_callbacks(struct socket *sock, | |||
367 | /* | 206 | /* |
368 | * initiate connection to a remote socket. | 207 | * initiate connection to a remote socket. |
369 | */ | 208 | */ |
370 | static int ceph_tcp_connect(struct ceph_connection *con) | 209 | static struct socket *ceph_tcp_connect(struct ceph_connection *con) |
371 | { | 210 | { |
372 | struct sockaddr_storage *paddr = &con->peer_addr.in_addr; | 211 | struct sockaddr_storage *paddr = &con->peer_addr.in_addr; |
373 | struct socket *sock; | 212 | struct socket *sock; |
@@ -377,7 +216,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) | |||
377 | ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, | 216 | ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, |
378 | IPPROTO_TCP, &sock); | 217 | IPPROTO_TCP, &sock); |
379 | if (ret) | 218 | if (ret) |
380 | return ret; | 219 | return ERR_PTR(ret); |
220 | con->sock = sock; | ||
381 | sock->sk->sk_allocation = GFP_NOFS; | 221 | sock->sk->sk_allocation = GFP_NOFS; |
382 | 222 | ||
383 | #ifdef CONFIG_LOCKDEP | 223 | #ifdef CONFIG_LOCKDEP |
@@ -388,23 +228,25 @@ static int ceph_tcp_connect(struct ceph_connection *con) | |||
388 | 228 | ||
389 | dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); | 229 | dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); |
390 | 230 | ||
391 | con_sock_state_connecting(con); | ||
392 | ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), | 231 | ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), |
393 | O_NONBLOCK); | 232 | O_NONBLOCK); |
394 | if (ret == -EINPROGRESS) { | 233 | if (ret == -EINPROGRESS) { |
395 | dout("connect %s EINPROGRESS sk_state = %u\n", | 234 | dout("connect %s EINPROGRESS sk_state = %u\n", |
396 | ceph_pr_addr(&con->peer_addr.in_addr), | 235 | ceph_pr_addr(&con->peer_addr.in_addr), |
397 | sock->sk->sk_state); | 236 | sock->sk->sk_state); |
398 | } else if (ret < 0) { | 237 | ret = 0; |
238 | } | ||
239 | if (ret < 0) { | ||
399 | pr_err("connect %s error %d\n", | 240 | pr_err("connect %s error %d\n", |
400 | ceph_pr_addr(&con->peer_addr.in_addr), ret); | 241 | ceph_pr_addr(&con->peer_addr.in_addr), ret); |
401 | sock_release(sock); | 242 | sock_release(sock); |
243 | con->sock = NULL; | ||
402 | con->error_msg = "connect error"; | 244 | con->error_msg = "connect error"; |
403 | |||
404 | return ret; | ||
405 | } | 245 | } |
406 | con->sock = sock; | 246 | |
407 | return 0; | 247 | if (ret < 0) |
248 | return ERR_PTR(ret); | ||
249 | return sock; | ||
408 | } | 250 | } |
409 | 251 | ||
410 | static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) | 252 | static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) |
@@ -440,43 +282,22 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, | |||
440 | return r; | 282 | return r; |
441 | } | 283 | } |
442 | 284 | ||
443 | static int ceph_tcp_sendpage(struct socket *sock, struct page *page, | ||
444 | int offset, size_t size, int more) | ||
445 | { | ||
446 | int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); | ||
447 | int ret; | ||
448 | |||
449 | ret = kernel_sendpage(sock, page, offset, size, flags); | ||
450 | if (ret == -EAGAIN) | ||
451 | ret = 0; | ||
452 | |||
453 | return ret; | ||
454 | } | ||
455 | |||
456 | 285 | ||
457 | /* | 286 | /* |
458 | * Shutdown/close the socket for the given connection. | 287 | * Shutdown/close the socket for the given connection. |
459 | */ | 288 | */ |
460 | static int con_close_socket(struct ceph_connection *con) | 289 | static int con_close_socket(struct ceph_connection *con) |
461 | { | 290 | { |
462 | int rc = 0; | 291 | int rc; |
463 | 292 | ||
464 | dout("con_close_socket on %p sock %p\n", con, con->sock); | 293 | dout("con_close_socket on %p sock %p\n", con, con->sock); |
465 | if (con->sock) { | 294 | if (!con->sock) |
466 | rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); | 295 | return 0; |
467 | sock_release(con->sock); | 296 | set_bit(SOCK_CLOSED, &con->state); |
468 | con->sock = NULL; | 297 | rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); |
469 | } | 298 | sock_release(con->sock); |
470 | 299 | con->sock = NULL; | |
471 | /* | 300 | clear_bit(SOCK_CLOSED, &con->state); |
472 | * Forcibly clear the SOCK_CLOSED flag. It gets set | ||
473 | * independent of the connection mutex, and we could have | ||
474 | * received a socket close event before we had the chance to | ||
475 | * shut the socket down. | ||
476 | */ | ||
477 | clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); | ||
478 | |||
479 | con_sock_state_closed(con); | ||
480 | return rc; | 301 | return rc; |
481 | } | 302 | } |
482 | 303 | ||
@@ -487,10 +308,6 @@ static int con_close_socket(struct ceph_connection *con) | |||
487 | static void ceph_msg_remove(struct ceph_msg *msg) | 308 | static void ceph_msg_remove(struct ceph_msg *msg) |
488 | { | 309 | { |
489 | list_del_init(&msg->list_head); | 310 | list_del_init(&msg->list_head); |
490 | BUG_ON(msg->con == NULL); | ||
491 | msg->con->ops->put(msg->con); | ||
492 | msg->con = NULL; | ||
493 | |||
494 | ceph_msg_put(msg); | 311 | ceph_msg_put(msg); |
495 | } | 312 | } |
496 | static void ceph_msg_remove_list(struct list_head *head) | 313 | static void ceph_msg_remove_list(struct list_head *head) |
@@ -506,16 +323,12 @@ static void reset_connection(struct ceph_connection *con) | |||
506 | { | 323 | { |
507 | /* reset connection, out_queue, msg_ and connect_seq */ | 324 | /* reset connection, out_queue, msg_ and connect_seq */ |
508 | /* discard existing out_queue and msg_seq */ | 325 | /* discard existing out_queue and msg_seq */ |
509 | dout("reset_connection %p\n", con); | ||
510 | ceph_msg_remove_list(&con->out_queue); | 326 | ceph_msg_remove_list(&con->out_queue); |
511 | ceph_msg_remove_list(&con->out_sent); | 327 | ceph_msg_remove_list(&con->out_sent); |
512 | 328 | ||
513 | if (con->in_msg) { | 329 | if (con->in_msg) { |
514 | BUG_ON(con->in_msg->con != con); | ||
515 | con->in_msg->con = NULL; | ||
516 | ceph_msg_put(con->in_msg); | 330 | ceph_msg_put(con->in_msg); |
517 | con->in_msg = NULL; | 331 | con->in_msg = NULL; |
518 | con->ops->put(con); | ||
519 | } | 332 | } |
520 | 333 | ||
521 | con->connect_seq = 0; | 334 | con->connect_seq = 0; |
@@ -533,44 +346,32 @@ static void reset_connection(struct ceph_connection *con) | |||
533 | */ | 346 | */ |
534 | void ceph_con_close(struct ceph_connection *con) | 347 | void ceph_con_close(struct ceph_connection *con) |
535 | { | 348 | { |
536 | mutex_lock(&con->mutex); | ||
537 | dout("con_close %p peer %s\n", con, | 349 | dout("con_close %p peer %s\n", con, |
538 | ceph_pr_addr(&con->peer_addr.in_addr)); | 350 | ceph_pr_addr(&con->peer_addr.in_addr)); |
539 | con->state = CON_STATE_CLOSED; | 351 | set_bit(CLOSED, &con->state); /* in case there's queued work */ |
540 | 352 | clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ | |
541 | clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ | 353 | clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ |
542 | clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); | 354 | clear_bit(KEEPALIVE_PENDING, &con->state); |
543 | clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 355 | clear_bit(WRITE_PENDING, &con->state); |
544 | clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); | 356 | mutex_lock(&con->mutex); |
545 | clear_bit(CON_FLAG_BACKOFF, &con->flags); | ||
546 | |||
547 | reset_connection(con); | 357 | reset_connection(con); |
548 | con->peer_global_seq = 0; | 358 | con->peer_global_seq = 0; |
549 | cancel_delayed_work(&con->work); | 359 | cancel_delayed_work(&con->work); |
550 | con_close_socket(con); | ||
551 | mutex_unlock(&con->mutex); | 360 | mutex_unlock(&con->mutex); |
361 | queue_con(con); | ||
552 | } | 362 | } |
553 | EXPORT_SYMBOL(ceph_con_close); | 363 | EXPORT_SYMBOL(ceph_con_close); |
554 | 364 | ||
555 | /* | 365 | /* |
556 | * Reopen a closed connection, with a new peer address. | 366 | * Reopen a closed connection, with a new peer address. |
557 | */ | 367 | */ |
558 | void ceph_con_open(struct ceph_connection *con, | 368 | void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) |
559 | __u8 entity_type, __u64 entity_num, | ||
560 | struct ceph_entity_addr *addr) | ||
561 | { | 369 | { |
562 | mutex_lock(&con->mutex); | ||
563 | dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); | 370 | dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); |
564 | 371 | set_bit(OPENING, &con->state); | |
565 | WARN_ON(con->state != CON_STATE_CLOSED); | 372 | clear_bit(CLOSED, &con->state); |
566 | con->state = CON_STATE_PREOPEN; | ||
567 | |||
568 | con->peer_name.type = (__u8) entity_type; | ||
569 | con->peer_name.num = cpu_to_le64(entity_num); | ||
570 | |||
571 | memcpy(&con->peer_addr, addr, sizeof(*addr)); | 373 | memcpy(&con->peer_addr, addr, sizeof(*addr)); |
572 | con->delay = 0; /* reset backoff memory */ | 374 | con->delay = 0; /* reset backoff memory */ |
573 | mutex_unlock(&con->mutex); | ||
574 | queue_con(con); | 375 | queue_con(con); |
575 | } | 376 | } |
576 | EXPORT_SYMBOL(ceph_con_open); | 377 | EXPORT_SYMBOL(ceph_con_open); |
@@ -584,26 +385,41 @@ bool ceph_con_opened(struct ceph_connection *con) | |||
584 | } | 385 | } |
585 | 386 | ||
586 | /* | 387 | /* |
388 | * generic get/put | ||
389 | */ | ||
390 | struct ceph_connection *ceph_con_get(struct ceph_connection *con) | ||
391 | { | ||
392 | dout("con_get %p nref = %d -> %d\n", con, | ||
393 | atomic_read(&con->nref), atomic_read(&con->nref) + 1); | ||
394 | if (atomic_inc_not_zero(&con->nref)) | ||
395 | return con; | ||
396 | return NULL; | ||
397 | } | ||
398 | |||
399 | void ceph_con_put(struct ceph_connection *con) | ||
400 | { | ||
401 | dout("con_put %p nref = %d -> %d\n", con, | ||
402 | atomic_read(&con->nref), atomic_read(&con->nref) - 1); | ||
403 | BUG_ON(atomic_read(&con->nref) == 0); | ||
404 | if (atomic_dec_and_test(&con->nref)) { | ||
405 | BUG_ON(con->sock); | ||
406 | kfree(con); | ||
407 | } | ||
408 | } | ||
409 | |||
410 | /* | ||
587 | * initialize a new connection. | 411 | * initialize a new connection. |
588 | */ | 412 | */ |
589 | void ceph_con_init(struct ceph_connection *con, void *private, | 413 | void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) |
590 | const struct ceph_connection_operations *ops, | ||
591 | struct ceph_messenger *msgr) | ||
592 | { | 414 | { |
593 | dout("con_init %p\n", con); | 415 | dout("con_init %p\n", con); |
594 | memset(con, 0, sizeof(*con)); | 416 | memset(con, 0, sizeof(*con)); |
595 | con->private = private; | 417 | atomic_set(&con->nref, 1); |
596 | con->ops = ops; | ||
597 | con->msgr = msgr; | 418 | con->msgr = msgr; |
598 | |||
599 | con_sock_state_init(con); | ||
600 | |||
601 | mutex_init(&con->mutex); | 419 | mutex_init(&con->mutex); |
602 | INIT_LIST_HEAD(&con->out_queue); | 420 | INIT_LIST_HEAD(&con->out_queue); |
603 | INIT_LIST_HEAD(&con->out_sent); | 421 | INIT_LIST_HEAD(&con->out_sent); |
604 | INIT_DELAYED_WORK(&con->work, con_work); | 422 | INIT_DELAYED_WORK(&con->work, con_work); |
605 | |||
606 | con->state = CON_STATE_CLOSED; | ||
607 | } | 423 | } |
608 | EXPORT_SYMBOL(ceph_con_init); | 424 | EXPORT_SYMBOL(ceph_con_init); |
609 | 425 | ||
@@ -624,84 +440,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) | |||
624 | return ret; | 440 | return ret; |
625 | } | 441 | } |
626 | 442 | ||
627 | static void con_out_kvec_reset(struct ceph_connection *con) | ||
628 | { | ||
629 | con->out_kvec_left = 0; | ||
630 | con->out_kvec_bytes = 0; | ||
631 | con->out_kvec_cur = &con->out_kvec[0]; | ||
632 | } | ||
633 | |||
634 | static void con_out_kvec_add(struct ceph_connection *con, | ||
635 | size_t size, void *data) | ||
636 | { | ||
637 | int index; | ||
638 | |||
639 | index = con->out_kvec_left; | ||
640 | BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); | ||
641 | |||
642 | con->out_kvec[index].iov_len = size; | ||
643 | con->out_kvec[index].iov_base = data; | ||
644 | con->out_kvec_left++; | ||
645 | con->out_kvec_bytes += size; | ||
646 | } | ||
647 | |||
648 | #ifdef CONFIG_BLOCK | ||
649 | static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) | ||
650 | { | ||
651 | if (!bio) { | ||
652 | *iter = NULL; | ||
653 | *seg = 0; | ||
654 | return; | ||
655 | } | ||
656 | *iter = bio; | ||
657 | *seg = bio->bi_idx; | ||
658 | } | ||
659 | |||
660 | static void iter_bio_next(struct bio **bio_iter, int *seg) | ||
661 | { | ||
662 | if (*bio_iter == NULL) | ||
663 | return; | ||
664 | |||
665 | BUG_ON(*seg >= (*bio_iter)->bi_vcnt); | ||
666 | |||
667 | (*seg)++; | ||
668 | if (*seg == (*bio_iter)->bi_vcnt) | ||
669 | init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); | ||
670 | } | ||
671 | #endif | ||
672 | |||
673 | static void prepare_write_message_data(struct ceph_connection *con) | ||
674 | { | ||
675 | struct ceph_msg *msg = con->out_msg; | ||
676 | |||
677 | BUG_ON(!msg); | ||
678 | BUG_ON(!msg->hdr.data_len); | ||
679 | |||
680 | /* initialize page iterator */ | ||
681 | con->out_msg_pos.page = 0; | ||
682 | if (msg->pages) | ||
683 | con->out_msg_pos.page_pos = msg->page_alignment; | ||
684 | else | ||
685 | con->out_msg_pos.page_pos = 0; | ||
686 | #ifdef CONFIG_BLOCK | ||
687 | if (msg->bio) | ||
688 | init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); | ||
689 | #endif | ||
690 | con->out_msg_pos.data_pos = 0; | ||
691 | con->out_msg_pos.did_page_crc = false; | ||
692 | con->out_more = 1; /* data + footer will follow */ | ||
693 | } | ||
694 | 443 | ||
695 | /* | 444 | /* |
696 | * Prepare footer for currently outgoing message, and finish things | 445 | * Prepare footer for currently outgoing message, and finish things |
697 | * off. Assumes out_kvec* are already valid.. we just add on to the end. | 446 | * off. Assumes out_kvec* are already valid.. we just add on to the end. |
698 | */ | 447 | */ |
699 | static void prepare_write_message_footer(struct ceph_connection *con) | 448 | static void prepare_write_message_footer(struct ceph_connection *con, int v) |
700 | { | 449 | { |
701 | struct ceph_msg *m = con->out_msg; | 450 | struct ceph_msg *m = con->out_msg; |
702 | int v = con->out_kvec_left; | ||
703 | |||
704 | m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; | ||
705 | 451 | ||
706 | dout("prepare_write_message_footer %p\n", con); | 452 | dout("prepare_write_message_footer %p\n", con); |
707 | con->out_kvec_is_msg = true; | 453 | con->out_kvec_is_msg = true; |
@@ -719,9 +465,9 @@ static void prepare_write_message_footer(struct ceph_connection *con) | |||
719 | static void prepare_write_message(struct ceph_connection *con) | 465 | static void prepare_write_message(struct ceph_connection *con) |
720 | { | 466 | { |
721 | struct ceph_msg *m; | 467 | struct ceph_msg *m; |
722 | u32 crc; | 468 | int v = 0; |
723 | 469 | ||
724 | con_out_kvec_reset(con); | 470 | con->out_kvec_bytes = 0; |
725 | con->out_kvec_is_msg = true; | 471 | con->out_kvec_is_msg = true; |
726 | con->out_msg_done = false; | 472 | con->out_msg_done = false; |
727 | 473 | ||
@@ -729,16 +475,17 @@ static void prepare_write_message(struct ceph_connection *con) | |||
729 | * TCP packet that's a good thing. */ | 475 | * TCP packet that's a good thing. */ |
730 | if (con->in_seq > con->in_seq_acked) { | 476 | if (con->in_seq > con->in_seq_acked) { |
731 | con->in_seq_acked = con->in_seq; | 477 | con->in_seq_acked = con->in_seq; |
732 | con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); | 478 | con->out_kvec[v].iov_base = &tag_ack; |
479 | con->out_kvec[v++].iov_len = 1; | ||
733 | con->out_temp_ack = cpu_to_le64(con->in_seq_acked); | 480 | con->out_temp_ack = cpu_to_le64(con->in_seq_acked); |
734 | con_out_kvec_add(con, sizeof (con->out_temp_ack), | 481 | con->out_kvec[v].iov_base = &con->out_temp_ack; |
735 | &con->out_temp_ack); | 482 | con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); |
483 | con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); | ||
736 | } | 484 | } |
737 | 485 | ||
738 | BUG_ON(list_empty(&con->out_queue)); | 486 | m = list_first_entry(&con->out_queue, |
739 | m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); | 487 | struct ceph_msg, list_head); |
740 | con->out_msg = m; | 488 | con->out_msg = m; |
741 | BUG_ON(m->con != con); | ||
742 | 489 | ||
743 | /* put message on sent list */ | 490 | /* put message on sent list */ |
744 | ceph_msg_get(m); | 491 | ceph_msg_get(m); |
@@ -752,10 +499,6 @@ static void prepare_write_message(struct ceph_connection *con) | |||
752 | m->hdr.seq = cpu_to_le64(++con->out_seq); | 499 | m->hdr.seq = cpu_to_le64(++con->out_seq); |
753 | m->needs_out_seq = false; | 500 | m->needs_out_seq = false; |
754 | } | 501 | } |
755 | #ifdef CONFIG_BLOCK | ||
756 | else | ||
757 | m->bio_iter = NULL; | ||
758 | #endif | ||
759 | 502 | ||
760 | dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", | 503 | dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", |
761 | m, con->out_seq, le16_to_cpu(m->hdr.type), | 504 | m, con->out_seq, le16_to_cpu(m->hdr.type), |
@@ -765,40 +508,53 @@ static void prepare_write_message(struct ceph_connection *con) | |||
765 | BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); | 508 | BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); |
766 | 509 | ||
767 | /* tag + hdr + front + middle */ | 510 | /* tag + hdr + front + middle */ |
768 | con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); | 511 | con->out_kvec[v].iov_base = &tag_msg; |
769 | con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); | 512 | con->out_kvec[v++].iov_len = 1; |
770 | con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); | 513 | con->out_kvec[v].iov_base = &m->hdr; |
771 | 514 | con->out_kvec[v++].iov_len = sizeof(m->hdr); | |
515 | con->out_kvec[v++] = m->front; | ||
772 | if (m->middle) | 516 | if (m->middle) |
773 | con_out_kvec_add(con, m->middle->vec.iov_len, | 517 | con->out_kvec[v++] = m->middle->vec; |
774 | m->middle->vec.iov_base); | 518 | con->out_kvec_left = v; |
519 | con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + | ||
520 | (m->middle ? m->middle->vec.iov_len : 0); | ||
521 | con->out_kvec_cur = con->out_kvec; | ||
775 | 522 | ||
776 | /* fill in crc (except data pages), footer */ | 523 | /* fill in crc (except data pages), footer */ |
777 | crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); | 524 | con->out_msg->hdr.crc = |
778 | con->out_msg->hdr.crc = cpu_to_le32(crc); | 525 | cpu_to_le32(crc32c(0, (void *)&m->hdr, |
779 | con->out_msg->footer.flags = 0; | 526 | sizeof(m->hdr) - sizeof(m->hdr.crc))); |
780 | 527 | con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; | |
781 | crc = crc32c(0, m->front.iov_base, m->front.iov_len); | 528 | con->out_msg->footer.front_crc = |
782 | con->out_msg->footer.front_crc = cpu_to_le32(crc); | 529 | cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); |
783 | if (m->middle) { | 530 | if (m->middle) |
784 | crc = crc32c(0, m->middle->vec.iov_base, | 531 | con->out_msg->footer.middle_crc = |
785 | m->middle->vec.iov_len); | 532 | cpu_to_le32(crc32c(0, m->middle->vec.iov_base, |
786 | con->out_msg->footer.middle_crc = cpu_to_le32(crc); | 533 | m->middle->vec.iov_len)); |
787 | } else | 534 | else |
788 | con->out_msg->footer.middle_crc = 0; | 535 | con->out_msg->footer.middle_crc = 0; |
789 | dout("%s front_crc %u middle_crc %u\n", __func__, | 536 | con->out_msg->footer.data_crc = 0; |
537 | dout("prepare_write_message front_crc %u data_crc %u\n", | ||
790 | le32_to_cpu(con->out_msg->footer.front_crc), | 538 | le32_to_cpu(con->out_msg->footer.front_crc), |
791 | le32_to_cpu(con->out_msg->footer.middle_crc)); | 539 | le32_to_cpu(con->out_msg->footer.middle_crc)); |
792 | 540 | ||
793 | /* is there a data payload? */ | 541 | /* is there a data payload? */ |
794 | con->out_msg->footer.data_crc = 0; | 542 | if (le32_to_cpu(m->hdr.data_len) > 0) { |
795 | if (m->hdr.data_len) | 543 | /* initialize page iterator */ |
796 | prepare_write_message_data(con); | 544 | con->out_msg_pos.page = 0; |
797 | else | 545 | if (m->pages) |
546 | con->out_msg_pos.page_pos = m->page_alignment; | ||
547 | else | ||
548 | con->out_msg_pos.page_pos = 0; | ||
549 | con->out_msg_pos.data_pos = 0; | ||
550 | con->out_msg_pos.did_page_crc = 0; | ||
551 | con->out_more = 1; /* data + footer will follow */ | ||
552 | } else { | ||
798 | /* no, queue up footer too and be done */ | 553 | /* no, queue up footer too and be done */ |
799 | prepare_write_message_footer(con); | 554 | prepare_write_message_footer(con, v); |
555 | } | ||
800 | 556 | ||
801 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 557 | set_bit(WRITE_PENDING, &con->state); |
802 | } | 558 | } |
803 | 559 | ||
804 | /* | 560 | /* |
@@ -810,16 +566,16 @@ static void prepare_write_ack(struct ceph_connection *con) | |||
810 | con->in_seq_acked, con->in_seq); | 566 | con->in_seq_acked, con->in_seq); |
811 | con->in_seq_acked = con->in_seq; | 567 | con->in_seq_acked = con->in_seq; |
812 | 568 | ||
813 | con_out_kvec_reset(con); | 569 | con->out_kvec[0].iov_base = &tag_ack; |
814 | 570 | con->out_kvec[0].iov_len = 1; | |
815 | con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); | ||
816 | |||
817 | con->out_temp_ack = cpu_to_le64(con->in_seq_acked); | 571 | con->out_temp_ack = cpu_to_le64(con->in_seq_acked); |
818 | con_out_kvec_add(con, sizeof (con->out_temp_ack), | 572 | con->out_kvec[1].iov_base = &con->out_temp_ack; |
819 | &con->out_temp_ack); | 573 | con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); |
820 | 574 | con->out_kvec_left = 2; | |
575 | con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); | ||
576 | con->out_kvec_cur = con->out_kvec; | ||
821 | con->out_more = 1; /* more will follow.. eventually.. */ | 577 | con->out_more = 1; /* more will follow.. eventually.. */ |
822 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 578 | set_bit(WRITE_PENDING, &con->state); |
823 | } | 579 | } |
824 | 580 | ||
825 | /* | 581 | /* |
@@ -828,60 +584,73 @@ static void prepare_write_ack(struct ceph_connection *con) | |||
828 | static void prepare_write_keepalive(struct ceph_connection *con) | 584 | static void prepare_write_keepalive(struct ceph_connection *con) |
829 | { | 585 | { |
830 | dout("prepare_write_keepalive %p\n", con); | 586 | dout("prepare_write_keepalive %p\n", con); |
831 | con_out_kvec_reset(con); | 587 | con->out_kvec[0].iov_base = &tag_keepalive; |
832 | con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); | 588 | con->out_kvec[0].iov_len = 1; |
833 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 589 | con->out_kvec_left = 1; |
590 | con->out_kvec_bytes = 1; | ||
591 | con->out_kvec_cur = con->out_kvec; | ||
592 | set_bit(WRITE_PENDING, &con->state); | ||
834 | } | 593 | } |
835 | 594 | ||
836 | /* | 595 | /* |
837 | * Connection negotiation. | 596 | * Connection negotiation. |
838 | */ | 597 | */ |
839 | 598 | ||
840 | static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, | 599 | static int prepare_connect_authorizer(struct ceph_connection *con) |
841 | int *auth_proto) | ||
842 | { | 600 | { |
843 | struct ceph_auth_handshake *auth; | 601 | void *auth_buf; |
844 | 602 | int auth_len = 0; | |
845 | if (!con->ops->get_authorizer) { | 603 | int auth_protocol = 0; |
846 | con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; | ||
847 | con->out_connect.authorizer_len = 0; | ||
848 | return NULL; | ||
849 | } | ||
850 | 604 | ||
851 | /* Can't hold the mutex while getting authorizer */ | ||
852 | mutex_unlock(&con->mutex); | 605 | mutex_unlock(&con->mutex); |
853 | auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); | 606 | if (con->ops->get_authorizer) |
607 | con->ops->get_authorizer(con, &auth_buf, &auth_len, | ||
608 | &auth_protocol, &con->auth_reply_buf, | ||
609 | &con->auth_reply_buf_len, | ||
610 | con->auth_retry); | ||
854 | mutex_lock(&con->mutex); | 611 | mutex_lock(&con->mutex); |
855 | 612 | ||
856 | if (IS_ERR(auth)) | 613 | if (test_bit(CLOSED, &con->state) || |
857 | return auth; | 614 | test_bit(OPENING, &con->state)) |
858 | if (con->state != CON_STATE_NEGOTIATING) | 615 | return -EAGAIN; |
859 | return ERR_PTR(-EAGAIN); | ||
860 | 616 | ||
861 | con->auth_reply_buf = auth->authorizer_reply_buf; | 617 | con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); |
862 | con->auth_reply_buf_len = auth->authorizer_reply_buf_len; | 618 | con->out_connect.authorizer_len = cpu_to_le32(auth_len); |
863 | return auth; | 619 | |
620 | if (auth_len) { | ||
621 | con->out_kvec[con->out_kvec_left].iov_base = auth_buf; | ||
622 | con->out_kvec[con->out_kvec_left].iov_len = auth_len; | ||
623 | con->out_kvec_left++; | ||
624 | con->out_kvec_bytes += auth_len; | ||
625 | } | ||
626 | return 0; | ||
864 | } | 627 | } |
865 | 628 | ||
866 | /* | 629 | /* |
867 | * We connected to a peer and are saying hello. | 630 | * We connected to a peer and are saying hello. |
868 | */ | 631 | */ |
869 | static void prepare_write_banner(struct ceph_connection *con) | 632 | static void prepare_write_banner(struct ceph_messenger *msgr, |
870 | { | 633 | struct ceph_connection *con) |
871 | con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); | 634 | { |
872 | con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), | 635 | int len = strlen(CEPH_BANNER); |
873 | &con->msgr->my_enc_addr); | 636 | |
874 | 637 | con->out_kvec[0].iov_base = CEPH_BANNER; | |
638 | con->out_kvec[0].iov_len = len; | ||
639 | con->out_kvec[1].iov_base = &msgr->my_enc_addr; | ||
640 | con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); | ||
641 | con->out_kvec_left = 2; | ||
642 | con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); | ||
643 | con->out_kvec_cur = con->out_kvec; | ||
875 | con->out_more = 0; | 644 | con->out_more = 0; |
876 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 645 | set_bit(WRITE_PENDING, &con->state); |
877 | } | 646 | } |
878 | 647 | ||
879 | static int prepare_write_connect(struct ceph_connection *con) | 648 | static int prepare_write_connect(struct ceph_messenger *msgr, |
649 | struct ceph_connection *con, | ||
650 | int after_banner) | ||
880 | { | 651 | { |
881 | unsigned int global_seq = get_global_seq(con->msgr, 0); | 652 | unsigned global_seq = get_global_seq(con->msgr, 0); |
882 | int proto; | 653 | int proto; |
883 | int auth_proto; | ||
884 | struct ceph_auth_handshake *auth; | ||
885 | 654 | ||
886 | switch (con->peer_name.type) { | 655 | switch (con->peer_name.type) { |
887 | case CEPH_ENTITY_TYPE_MON: | 656 | case CEPH_ENTITY_TYPE_MON: |
@@ -900,34 +669,29 @@ static int prepare_write_connect(struct ceph_connection *con) | |||
900 | dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, | 669 | dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, |
901 | con->connect_seq, global_seq, proto); | 670 | con->connect_seq, global_seq, proto); |
902 | 671 | ||
903 | con->out_connect.features = cpu_to_le64(con->msgr->supported_features); | 672 | con->out_connect.features = cpu_to_le64(msgr->supported_features); |
904 | con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); | 673 | con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); |
905 | con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); | 674 | con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); |
906 | con->out_connect.global_seq = cpu_to_le32(global_seq); | 675 | con->out_connect.global_seq = cpu_to_le32(global_seq); |
907 | con->out_connect.protocol_version = cpu_to_le32(proto); | 676 | con->out_connect.protocol_version = cpu_to_le32(proto); |
908 | con->out_connect.flags = 0; | 677 | con->out_connect.flags = 0; |
909 | 678 | ||
910 | auth_proto = CEPH_AUTH_UNKNOWN; | 679 | if (!after_banner) { |
911 | auth = get_connect_authorizer(con, &auth_proto); | 680 | con->out_kvec_left = 0; |
912 | if (IS_ERR(auth)) | 681 | con->out_kvec_bytes = 0; |
913 | return PTR_ERR(auth); | 682 | } |
914 | 683 | con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; | |
915 | con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); | 684 | con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); |
916 | con->out_connect.authorizer_len = auth ? | 685 | con->out_kvec_left++; |
917 | cpu_to_le32(auth->authorizer_buf_len) : 0; | 686 | con->out_kvec_bytes += sizeof(con->out_connect); |
918 | 687 | con->out_kvec_cur = con->out_kvec; | |
919 | con_out_kvec_add(con, sizeof (con->out_connect), | ||
920 | &con->out_connect); | ||
921 | if (auth && auth->authorizer_buf_len) | ||
922 | con_out_kvec_add(con, auth->authorizer_buf_len, | ||
923 | auth->authorizer_buf); | ||
924 | |||
925 | con->out_more = 0; | 688 | con->out_more = 0; |
926 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 689 | set_bit(WRITE_PENDING, &con->state); |
927 | 690 | ||
928 | return 0; | 691 | return prepare_connect_authorizer(con); |
929 | } | 692 | } |
930 | 693 | ||
694 | |||
931 | /* | 695 | /* |
932 | * write as much of pending kvecs to the socket as we can. | 696 | * write as much of pending kvecs to the socket as we can. |
933 | * 1 -> done | 697 | * 1 -> done |
@@ -948,18 +712,17 @@ static int write_partial_kvec(struct ceph_connection *con) | |||
948 | con->out_kvec_bytes -= ret; | 712 | con->out_kvec_bytes -= ret; |
949 | if (con->out_kvec_bytes == 0) | 713 | if (con->out_kvec_bytes == 0) |
950 | break; /* done */ | 714 | break; /* done */ |
951 | 715 | while (ret > 0) { | |
952 | /* account for full iov entries consumed */ | 716 | if (ret >= con->out_kvec_cur->iov_len) { |
953 | while (ret >= con->out_kvec_cur->iov_len) { | 717 | ret -= con->out_kvec_cur->iov_len; |
954 | BUG_ON(!con->out_kvec_left); | 718 | con->out_kvec_cur++; |
955 | ret -= con->out_kvec_cur->iov_len; | 719 | con->out_kvec_left--; |
956 | con->out_kvec_cur++; | 720 | } else { |
957 | con->out_kvec_left--; | 721 | con->out_kvec_cur->iov_len -= ret; |
958 | } | 722 | con->out_kvec_cur->iov_base += ret; |
959 | /* and for a partially-consumed entry */ | 723 | ret = 0; |
960 | if (ret) { | 724 | break; |
961 | con->out_kvec_cur->iov_len -= ret; | 725 | } |
962 | con->out_kvec_cur->iov_base += ret; | ||
963 | } | 726 | } |
964 | } | 727 | } |
965 | con->out_kvec_left = 0; | 728 | con->out_kvec_left = 0; |
@@ -971,34 +734,30 @@ out: | |||
971 | return ret; /* done! */ | 734 | return ret; /* done! */ |
972 | } | 735 | } |
973 | 736 | ||
974 | static void out_msg_pos_next(struct ceph_connection *con, struct page *page, | 737 | #ifdef CONFIG_BLOCK |
975 | size_t len, size_t sent, bool in_trail) | 738 | static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) |
976 | { | 739 | { |
977 | struct ceph_msg *msg = con->out_msg; | 740 | if (!bio) { |
978 | 741 | *iter = NULL; | |
979 | BUG_ON(!msg); | 742 | *seg = 0; |
980 | BUG_ON(!sent); | 743 | return; |
744 | } | ||
745 | *iter = bio; | ||
746 | *seg = bio->bi_idx; | ||
747 | } | ||
981 | 748 | ||
982 | con->out_msg_pos.data_pos += sent; | 749 | static void iter_bio_next(struct bio **bio_iter, int *seg) |
983 | con->out_msg_pos.page_pos += sent; | 750 | { |
984 | if (sent < len) | 751 | if (*bio_iter == NULL) |
985 | return; | 752 | return; |
986 | 753 | ||
987 | BUG_ON(sent != len); | 754 | BUG_ON(*seg >= (*bio_iter)->bi_vcnt); |
988 | con->out_msg_pos.page_pos = 0; | 755 | |
989 | con->out_msg_pos.page++; | 756 | (*seg)++; |
990 | con->out_msg_pos.did_page_crc = false; | 757 | if (*seg == (*bio_iter)->bi_vcnt) |
991 | if (in_trail) | 758 | init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); |
992 | list_move_tail(&page->lru, | ||
993 | &msg->trail->head); | ||
994 | else if (msg->pagelist) | ||
995 | list_move_tail(&page->lru, | ||
996 | &msg->pagelist->head); | ||
997 | #ifdef CONFIG_BLOCK | ||
998 | else if (msg->bio) | ||
999 | iter_bio_next(&msg->bio_iter, &msg->bio_seg); | ||
1000 | #endif | ||
1001 | } | 759 | } |
760 | #endif | ||
1002 | 761 | ||
1003 | /* | 762 | /* |
1004 | * Write as much message data payload as we can. If we finish, queue | 763 | * Write as much message data payload as we can. If we finish, queue |
@@ -1010,90 +769,129 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, | |||
1010 | static int write_partial_msg_pages(struct ceph_connection *con) | 769 | static int write_partial_msg_pages(struct ceph_connection *con) |
1011 | { | 770 | { |
1012 | struct ceph_msg *msg = con->out_msg; | 771 | struct ceph_msg *msg = con->out_msg; |
1013 | unsigned int data_len = le32_to_cpu(msg->hdr.data_len); | 772 | unsigned data_len = le32_to_cpu(msg->hdr.data_len); |
1014 | size_t len; | 773 | size_t len; |
1015 | bool do_datacrc = !con->msgr->nocrc; | 774 | int crc = con->msgr->nocrc; |
1016 | int ret; | 775 | int ret; |
1017 | int total_max_write; | 776 | int total_max_write; |
1018 | bool in_trail = false; | 777 | int in_trail = 0; |
1019 | const size_t trail_len = (msg->trail ? msg->trail->length : 0); | 778 | size_t trail_len = (msg->trail ? msg->trail->length : 0); |
1020 | const size_t trail_off = data_len - trail_len; | ||
1021 | 779 | ||
1022 | dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", | 780 | dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", |
1023 | con, msg, con->out_msg_pos.page, msg->nr_pages, | 781 | con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, |
1024 | con->out_msg_pos.page_pos); | 782 | con->out_msg_pos.page_pos); |
1025 | 783 | ||
1026 | /* | 784 | #ifdef CONFIG_BLOCK |
1027 | * Iterate through each page that contains data to be | 785 | if (msg->bio && !msg->bio_iter) |
1028 | * written, and send as much as possible for each. | 786 | init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); |
1029 | * | 787 | #endif |
1030 | * If we are calculating the data crc (the default), we will | 788 | |
1031 | * need to map the page. If we have no pages, they have | ||
1032 | * been revoked, so use the zero page. | ||
1033 | */ | ||
1034 | while (data_len > con->out_msg_pos.data_pos) { | 789 | while (data_len > con->out_msg_pos.data_pos) { |
1035 | struct page *page = NULL; | 790 | struct page *page = NULL; |
791 | void *kaddr = NULL; | ||
1036 | int max_write = PAGE_SIZE; | 792 | int max_write = PAGE_SIZE; |
1037 | int bio_offset = 0; | 793 | int page_shift = 0; |
1038 | 794 | ||
1039 | in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; | 795 | total_max_write = data_len - trail_len - |
1040 | if (!in_trail) | 796 | con->out_msg_pos.data_pos; |
1041 | total_max_write = trail_off - con->out_msg_pos.data_pos; | 797 | |
798 | /* | ||
799 | * if we are calculating the data crc (the default), we need | ||
800 | * to map the page. if our pages[] has been revoked, use the | ||
801 | * zero page. | ||
802 | */ | ||
803 | |||
804 | /* have we reached the trail part of the data? */ | ||
805 | if (con->out_msg_pos.data_pos >= data_len - trail_len) { | ||
806 | in_trail = 1; | ||
1042 | 807 | ||
1043 | if (in_trail) { | ||
1044 | total_max_write = data_len - con->out_msg_pos.data_pos; | 808 | total_max_write = data_len - con->out_msg_pos.data_pos; |
1045 | 809 | ||
1046 | page = list_first_entry(&msg->trail->head, | 810 | page = list_first_entry(&msg->trail->head, |
1047 | struct page, lru); | 811 | struct page, lru); |
812 | if (crc) | ||
813 | kaddr = kmap(page); | ||
814 | max_write = PAGE_SIZE; | ||
1048 | } else if (msg->pages) { | 815 | } else if (msg->pages) { |
1049 | page = msg->pages[con->out_msg_pos.page]; | 816 | page = msg->pages[con->out_msg_pos.page]; |
817 | if (crc) | ||
818 | kaddr = kmap(page); | ||
1050 | } else if (msg->pagelist) { | 819 | } else if (msg->pagelist) { |
1051 | page = list_first_entry(&msg->pagelist->head, | 820 | page = list_first_entry(&msg->pagelist->head, |
1052 | struct page, lru); | 821 | struct page, lru); |
822 | if (crc) | ||
823 | kaddr = kmap(page); | ||
1053 | #ifdef CONFIG_BLOCK | 824 | #ifdef CONFIG_BLOCK |
1054 | } else if (msg->bio) { | 825 | } else if (msg->bio) { |
1055 | struct bio_vec *bv; | 826 | struct bio_vec *bv; |
1056 | 827 | ||
1057 | bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); | 828 | bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); |
1058 | page = bv->bv_page; | 829 | page = bv->bv_page; |
1059 | bio_offset = bv->bv_offset; | 830 | page_shift = bv->bv_offset; |
831 | if (crc) | ||
832 | kaddr = kmap(page) + page_shift; | ||
1060 | max_write = bv->bv_len; | 833 | max_write = bv->bv_len; |
1061 | #endif | 834 | #endif |
1062 | } else { | 835 | } else { |
1063 | page = zero_page; | 836 | page = con->msgr->zero_page; |
837 | if (crc) | ||
838 | kaddr = page_address(con->msgr->zero_page); | ||
1064 | } | 839 | } |
1065 | len = min_t(int, max_write - con->out_msg_pos.page_pos, | 840 | len = min_t(int, max_write - con->out_msg_pos.page_pos, |
1066 | total_max_write); | 841 | total_max_write); |
1067 | 842 | ||
1068 | if (do_datacrc && !con->out_msg_pos.did_page_crc) { | 843 | if (crc && !con->out_msg_pos.did_page_crc) { |
1069 | void *base; | 844 | void *base = kaddr + con->out_msg_pos.page_pos; |
1070 | u32 crc = le32_to_cpu(msg->footer.data_crc); | 845 | u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); |
1071 | char *kaddr; | ||
1072 | 846 | ||
1073 | kaddr = kmap(page); | ||
1074 | BUG_ON(kaddr == NULL); | 847 | BUG_ON(kaddr == NULL); |
1075 | base = kaddr + con->out_msg_pos.page_pos + bio_offset; | 848 | con->out_msg->footer.data_crc = |
1076 | crc = crc32c(crc, base, len); | 849 | cpu_to_le32(crc32c(tmpcrc, base, len)); |
1077 | kunmap(page); | 850 | con->out_msg_pos.did_page_crc = 1; |
1078 | msg->footer.data_crc = cpu_to_le32(crc); | ||
1079 | con->out_msg_pos.did_page_crc = true; | ||
1080 | } | 851 | } |
1081 | ret = ceph_tcp_sendpage(con->sock, page, | 852 | ret = kernel_sendpage(con->sock, page, |
1082 | con->out_msg_pos.page_pos + bio_offset, | 853 | con->out_msg_pos.page_pos + page_shift, |
1083 | len, 1); | 854 | len, |
855 | MSG_DONTWAIT | MSG_NOSIGNAL | | ||
856 | MSG_MORE); | ||
857 | |||
858 | if (crc && | ||
859 | (msg->pages || msg->pagelist || msg->bio || in_trail)) | ||
860 | kunmap(page); | ||
861 | |||
862 | if (ret == -EAGAIN) | ||
863 | ret = 0; | ||
1084 | if (ret <= 0) | 864 | if (ret <= 0) |
1085 | goto out; | 865 | goto out; |
1086 | 866 | ||
1087 | out_msg_pos_next(con, page, len, (size_t) ret, in_trail); | 867 | con->out_msg_pos.data_pos += ret; |
868 | con->out_msg_pos.page_pos += ret; | ||
869 | if (ret == len) { | ||
870 | con->out_msg_pos.page_pos = 0; | ||
871 | con->out_msg_pos.page++; | ||
872 | con->out_msg_pos.did_page_crc = 0; | ||
873 | if (in_trail) | ||
874 | list_move_tail(&page->lru, | ||
875 | &msg->trail->head); | ||
876 | else if (msg->pagelist) | ||
877 | list_move_tail(&page->lru, | ||
878 | &msg->pagelist->head); | ||
879 | #ifdef CONFIG_BLOCK | ||
880 | else if (msg->bio) | ||
881 | iter_bio_next(&msg->bio_iter, &msg->bio_seg); | ||
882 | #endif | ||
883 | } | ||
1088 | } | 884 | } |
1089 | 885 | ||
1090 | dout("write_partial_msg_pages %p msg %p done\n", con, msg); | 886 | dout("write_partial_msg_pages %p msg %p done\n", con, msg); |
1091 | 887 | ||
1092 | /* prepare and queue up footer, too */ | 888 | /* prepare and queue up footer, too */ |
1093 | if (!do_datacrc) | 889 | if (!crc) |
1094 | msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; | 890 | con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; |
1095 | con_out_kvec_reset(con); | 891 | con->out_kvec_bytes = 0; |
1096 | prepare_write_message_footer(con); | 892 | con->out_kvec_left = 0; |
893 | con->out_kvec_cur = con->out_kvec; | ||
894 | prepare_write_message_footer(con, 0); | ||
1097 | ret = 1; | 895 | ret = 1; |
1098 | out: | 896 | out: |
1099 | return ret; | 897 | return ret; |
@@ -1107,9 +905,12 @@ static int write_partial_skip(struct ceph_connection *con) | |||
1107 | int ret; | 905 | int ret; |
1108 | 906 | ||
1109 | while (con->out_skip > 0) { | 907 | while (con->out_skip > 0) { |
1110 | size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); | 908 | struct kvec iov = { |
909 | .iov_base = page_address(con->msgr->zero_page), | ||
910 | .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) | ||
911 | }; | ||
1111 | 912 | ||
1112 | ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); | 913 | ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); |
1113 | if (ret <= 0) | 914 | if (ret <= 0) |
1114 | goto out; | 915 | goto out; |
1115 | con->out_skip -= ret; | 916 | con->out_skip -= ret; |
@@ -1161,10 +962,11 @@ static int prepare_read_message(struct ceph_connection *con) | |||
1161 | 962 | ||
1162 | 963 | ||
1163 | static int read_partial(struct ceph_connection *con, | 964 | static int read_partial(struct ceph_connection *con, |
1164 | int end, int size, void *object) | 965 | int *to, int size, void *object) |
1165 | { | 966 | { |
1166 | while (con->in_base_pos < end) { | 967 | *to += size; |
1167 | int left = end - con->in_base_pos; | 968 | while (con->in_base_pos < *to) { |
969 | int left = *to - con->in_base_pos; | ||
1168 | int have = size - left; | 970 | int have = size - left; |
1169 | int ret = ceph_tcp_recvmsg(con->sock, object + have, left); | 971 | int ret = ceph_tcp_recvmsg(con->sock, object + have, left); |
1170 | if (ret <= 0) | 972 | if (ret <= 0) |
@@ -1180,52 +982,37 @@ static int read_partial(struct ceph_connection *con, | |||
1180 | */ | 982 | */ |
1181 | static int read_partial_banner(struct ceph_connection *con) | 983 | static int read_partial_banner(struct ceph_connection *con) |
1182 | { | 984 | { |
1183 | int size; | 985 | int ret, to = 0; |
1184 | int end; | ||
1185 | int ret; | ||
1186 | 986 | ||
1187 | dout("read_partial_banner %p at %d\n", con, con->in_base_pos); | 987 | dout("read_partial_banner %p at %d\n", con, con->in_base_pos); |
1188 | 988 | ||
1189 | /* peer's banner */ | 989 | /* peer's banner */ |
1190 | size = strlen(CEPH_BANNER); | 990 | ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); |
1191 | end = size; | ||
1192 | ret = read_partial(con, end, size, con->in_banner); | ||
1193 | if (ret <= 0) | 991 | if (ret <= 0) |
1194 | goto out; | 992 | goto out; |
1195 | 993 | ret = read_partial(con, &to, sizeof(con->actual_peer_addr), | |
1196 | size = sizeof (con->actual_peer_addr); | 994 | &con->actual_peer_addr); |
1197 | end += size; | ||
1198 | ret = read_partial(con, end, size, &con->actual_peer_addr); | ||
1199 | if (ret <= 0) | 995 | if (ret <= 0) |
1200 | goto out; | 996 | goto out; |
1201 | 997 | ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), | |
1202 | size = sizeof (con->peer_addr_for_me); | 998 | &con->peer_addr_for_me); |
1203 | end += size; | ||
1204 | ret = read_partial(con, end, size, &con->peer_addr_for_me); | ||
1205 | if (ret <= 0) | 999 | if (ret <= 0) |
1206 | goto out; | 1000 | goto out; |
1207 | |||
1208 | out: | 1001 | out: |
1209 | return ret; | 1002 | return ret; |
1210 | } | 1003 | } |
1211 | 1004 | ||
1212 | static int read_partial_connect(struct ceph_connection *con) | 1005 | static int read_partial_connect(struct ceph_connection *con) |
1213 | { | 1006 | { |
1214 | int size; | 1007 | int ret, to = 0; |
1215 | int end; | ||
1216 | int ret; | ||
1217 | 1008 | ||
1218 | dout("read_partial_connect %p at %d\n", con, con->in_base_pos); | 1009 | dout("read_partial_connect %p at %d\n", con, con->in_base_pos); |
1219 | 1010 | ||
1220 | size = sizeof (con->in_reply); | 1011 | ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); |
1221 | end = size; | ||
1222 | ret = read_partial(con, end, size, &con->in_reply); | ||
1223 | if (ret <= 0) | 1012 | if (ret <= 0) |
1224 | goto out; | 1013 | goto out; |
1225 | 1014 | ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), | |
1226 | size = le32_to_cpu(con->in_reply.authorizer_len); | 1015 | con->auth_reply_buf); |
1227 | end += size; | ||
1228 | ret = read_partial(con, end, size, con->auth_reply_buf); | ||
1229 | if (ret <= 0) | 1016 | if (ret <= 0) |
1230 | goto out; | 1017 | goto out; |
1231 | 1018 | ||
@@ -1291,101 +1078,6 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) | |||
1291 | } | 1078 | } |
1292 | 1079 | ||
1293 | /* | 1080 | /* |
1294 | * Unlike other *_pton function semantics, zero indicates success. | ||
1295 | */ | ||
1296 | static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, | ||
1297 | char delim, const char **ipend) | ||
1298 | { | ||
1299 | struct sockaddr_in *in4 = (struct sockaddr_in *) ss; | ||
1300 | struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; | ||
1301 | |||
1302 | memset(ss, 0, sizeof(*ss)); | ||
1303 | |||
1304 | if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) { | ||
1305 | ss->ss_family = AF_INET; | ||
1306 | return 0; | ||
1307 | } | ||
1308 | |||
1309 | if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) { | ||
1310 | ss->ss_family = AF_INET6; | ||
1311 | return 0; | ||
1312 | } | ||
1313 | |||
1314 | return -EINVAL; | ||
1315 | } | ||
1316 | |||
1317 | /* | ||
1318 | * Extract hostname string and resolve using kernel DNS facility. | ||
1319 | */ | ||
1320 | #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER | ||
1321 | static int ceph_dns_resolve_name(const char *name, size_t namelen, | ||
1322 | struct sockaddr_storage *ss, char delim, const char **ipend) | ||
1323 | { | ||
1324 | const char *end, *delim_p; | ||
1325 | char *colon_p, *ip_addr = NULL; | ||
1326 | int ip_len, ret; | ||
1327 | |||
1328 | /* | ||
1329 | * The end of the hostname occurs immediately preceding the delimiter or | ||
1330 | * the port marker (':') where the delimiter takes precedence. | ||
1331 | */ | ||
1332 | delim_p = memchr(name, delim, namelen); | ||
1333 | colon_p = memchr(name, ':', namelen); | ||
1334 | |||
1335 | if (delim_p && colon_p) | ||
1336 | end = delim_p < colon_p ? delim_p : colon_p; | ||
1337 | else if (!delim_p && colon_p) | ||
1338 | end = colon_p; | ||
1339 | else { | ||
1340 | end = delim_p; | ||
1341 | if (!end) /* case: hostname:/ */ | ||
1342 | end = name + namelen; | ||
1343 | } | ||
1344 | |||
1345 | if (end <= name) | ||
1346 | return -EINVAL; | ||
1347 | |||
1348 | /* do dns_resolve upcall */ | ||
1349 | ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL); | ||
1350 | if (ip_len > 0) | ||
1351 | ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL); | ||
1352 | else | ||
1353 | ret = -ESRCH; | ||
1354 | |||
1355 | kfree(ip_addr); | ||
1356 | |||
1357 | *ipend = end; | ||
1358 | |||
1359 | pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, | ||
1360 | ret, ret ? "failed" : ceph_pr_addr(ss)); | ||
1361 | |||
1362 | return ret; | ||
1363 | } | ||
1364 | #else | ||
1365 | static inline int ceph_dns_resolve_name(const char *name, size_t namelen, | ||
1366 | struct sockaddr_storage *ss, char delim, const char **ipend) | ||
1367 | { | ||
1368 | return -EINVAL; | ||
1369 | } | ||
1370 | #endif | ||
1371 | |||
1372 | /* | ||
1373 | * Parse a server name (IP or hostname). If a valid IP address is not found | ||
1374 | * then try to extract a hostname to resolve using userspace DNS upcall. | ||
1375 | */ | ||
1376 | static int ceph_parse_server_name(const char *name, size_t namelen, | ||
1377 | struct sockaddr_storage *ss, char delim, const char **ipend) | ||
1378 | { | ||
1379 | int ret; | ||
1380 | |||
1381 | ret = ceph_pton(name, namelen, ss, delim, ipend); | ||
1382 | if (ret) | ||
1383 | ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend); | ||
1384 | |||
1385 | return ret; | ||
1386 | } | ||
1387 | |||
1388 | /* | ||
1389 | * Parse an ip[:port] list into an addr array. Use the default | 1081 | * Parse an ip[:port] list into an addr array. Use the default |
1390 | * monitor port if a port isn't specified. | 1082 | * monitor port if a port isn't specified. |
1391 | */ | 1083 | */ |
@@ -1393,13 +1085,15 @@ int ceph_parse_ips(const char *c, const char *end, | |||
1393 | struct ceph_entity_addr *addr, | 1085 | struct ceph_entity_addr *addr, |
1394 | int max_count, int *count) | 1086 | int max_count, int *count) |
1395 | { | 1087 | { |
1396 | int i, ret = -EINVAL; | 1088 | int i; |
1397 | const char *p = c; | 1089 | const char *p = c; |
1398 | 1090 | ||
1399 | dout("parse_ips on '%.*s'\n", (int)(end-c), c); | 1091 | dout("parse_ips on '%.*s'\n", (int)(end-c), c); |
1400 | for (i = 0; i < max_count; i++) { | 1092 | for (i = 0; i < max_count; i++) { |
1401 | const char *ipend; | 1093 | const char *ipend; |
1402 | struct sockaddr_storage *ss = &addr[i].in_addr; | 1094 | struct sockaddr_storage *ss = &addr[i].in_addr; |
1095 | struct sockaddr_in *in4 = (void *)ss; | ||
1096 | struct sockaddr_in6 *in6 = (void *)ss; | ||
1403 | int port; | 1097 | int port; |
1404 | char delim = ','; | 1098 | char delim = ','; |
1405 | 1099 | ||
@@ -1408,11 +1102,15 @@ int ceph_parse_ips(const char *c, const char *end, | |||
1408 | p++; | 1102 | p++; |
1409 | } | 1103 | } |
1410 | 1104 | ||
1411 | ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); | 1105 | memset(ss, 0, sizeof(*ss)); |
1412 | if (ret) | 1106 | if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, |
1107 | delim, &ipend)) | ||
1108 | ss->ss_family = AF_INET; | ||
1109 | else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, | ||
1110 | delim, &ipend)) | ||
1111 | ss->ss_family = AF_INET6; | ||
1112 | else | ||
1413 | goto bad; | 1113 | goto bad; |
1414 | ret = -EINVAL; | ||
1415 | |||
1416 | p = ipend; | 1114 | p = ipend; |
1417 | 1115 | ||
1418 | if (delim == ']') { | 1116 | if (delim == ']') { |
@@ -1457,7 +1155,7 @@ int ceph_parse_ips(const char *c, const char *end, | |||
1457 | 1155 | ||
1458 | bad: | 1156 | bad: |
1459 | pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); | 1157 | pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); |
1460 | return ret; | 1158 | return -EINVAL; |
1461 | } | 1159 | } |
1462 | EXPORT_SYMBOL(ceph_parse_ips); | 1160 | EXPORT_SYMBOL(ceph_parse_ips); |
1463 | 1161 | ||
@@ -1504,9 +1202,22 @@ static int process_banner(struct ceph_connection *con) | |||
1504 | ceph_pr_addr(&con->msgr->inst.addr.in_addr)); | 1202 | ceph_pr_addr(&con->msgr->inst.addr.in_addr)); |
1505 | } | 1203 | } |
1506 | 1204 | ||
1205 | set_bit(NEGOTIATING, &con->state); | ||
1206 | prepare_read_connect(con); | ||
1507 | return 0; | 1207 | return 0; |
1508 | } | 1208 | } |
1509 | 1209 | ||
1210 | static void fail_protocol(struct ceph_connection *con) | ||
1211 | { | ||
1212 | reset_connection(con); | ||
1213 | set_bit(CLOSED, &con->state); /* in case there's queued work */ | ||
1214 | |||
1215 | mutex_unlock(&con->mutex); | ||
1216 | if (con->ops->bad_proto) | ||
1217 | con->ops->bad_proto(con); | ||
1218 | mutex_lock(&con->mutex); | ||
1219 | } | ||
1220 | |||
1510 | static int process_connect(struct ceph_connection *con) | 1221 | static int process_connect(struct ceph_connection *con) |
1511 | { | 1222 | { |
1512 | u64 sup_feat = con->msgr->supported_features; | 1223 | u64 sup_feat = con->msgr->supported_features; |
@@ -1524,7 +1235,7 @@ static int process_connect(struct ceph_connection *con) | |||
1524 | ceph_pr_addr(&con->peer_addr.in_addr), | 1235 | ceph_pr_addr(&con->peer_addr.in_addr), |
1525 | sup_feat, server_feat, server_feat & ~sup_feat); | 1236 | sup_feat, server_feat, server_feat & ~sup_feat); |
1526 | con->error_msg = "missing required protocol features"; | 1237 | con->error_msg = "missing required protocol features"; |
1527 | reset_connection(con); | 1238 | fail_protocol(con); |
1528 | return -1; | 1239 | return -1; |
1529 | 1240 | ||
1530 | case CEPH_MSGR_TAG_BADPROTOVER: | 1241 | case CEPH_MSGR_TAG_BADPROTOVER: |
@@ -1535,7 +1246,7 @@ static int process_connect(struct ceph_connection *con) | |||
1535 | le32_to_cpu(con->out_connect.protocol_version), | 1246 | le32_to_cpu(con->out_connect.protocol_version), |
1536 | le32_to_cpu(con->in_reply.protocol_version)); | 1247 | le32_to_cpu(con->in_reply.protocol_version)); |
1537 | con->error_msg = "protocol version mismatch"; | 1248 | con->error_msg = "protocol version mismatch"; |
1538 | reset_connection(con); | 1249 | fail_protocol(con); |
1539 | return -1; | 1250 | return -1; |
1540 | 1251 | ||
1541 | case CEPH_MSGR_TAG_BADAUTHORIZER: | 1252 | case CEPH_MSGR_TAG_BADAUTHORIZER: |
@@ -1547,8 +1258,7 @@ static int process_connect(struct ceph_connection *con) | |||
1547 | return -1; | 1258 | return -1; |
1548 | } | 1259 | } |
1549 | con->auth_retry = 1; | 1260 | con->auth_retry = 1; |
1550 | con_out_kvec_reset(con); | 1261 | ret = prepare_write_connect(con->msgr, con, 0); |
1551 | ret = prepare_write_connect(con); | ||
1552 | if (ret < 0) | 1262 | if (ret < 0) |
1553 | return ret; | 1263 | return ret; |
1554 | prepare_read_connect(con); | 1264 | prepare_read_connect(con); |
@@ -1563,15 +1273,12 @@ static int process_connect(struct ceph_connection *con) | |||
1563 | * dropped messages. | 1273 | * dropped messages. |
1564 | */ | 1274 | */ |
1565 | dout("process_connect got RESET peer seq %u\n", | 1275 | dout("process_connect got RESET peer seq %u\n", |
1566 | le32_to_cpu(con->in_reply.connect_seq)); | 1276 | le32_to_cpu(con->in_connect.connect_seq)); |
1567 | pr_err("%s%lld %s connection reset\n", | 1277 | pr_err("%s%lld %s connection reset\n", |
1568 | ENTITY_NAME(con->peer_name), | 1278 | ENTITY_NAME(con->peer_name), |
1569 | ceph_pr_addr(&con->peer_addr.in_addr)); | 1279 | ceph_pr_addr(&con->peer_addr.in_addr)); |
1570 | reset_connection(con); | 1280 | reset_connection(con); |
1571 | con_out_kvec_reset(con); | 1281 | prepare_write_connect(con->msgr, con, 0); |
1572 | ret = prepare_write_connect(con); | ||
1573 | if (ret < 0) | ||
1574 | return ret; | ||
1575 | prepare_read_connect(con); | 1282 | prepare_read_connect(con); |
1576 | 1283 | ||
1577 | /* Tell ceph about it. */ | 1284 | /* Tell ceph about it. */ |
@@ -1580,7 +1287,8 @@ static int process_connect(struct ceph_connection *con) | |||
1580 | if (con->ops->peer_reset) | 1287 | if (con->ops->peer_reset) |
1581 | con->ops->peer_reset(con); | 1288 | con->ops->peer_reset(con); |
1582 | mutex_lock(&con->mutex); | 1289 | mutex_lock(&con->mutex); |
1583 | if (con->state != CON_STATE_NEGOTIATING) | 1290 | if (test_bit(CLOSED, &con->state) || |
1291 | test_bit(OPENING, &con->state)) | ||
1584 | return -EAGAIN; | 1292 | return -EAGAIN; |
1585 | break; | 1293 | break; |
1586 | 1294 | ||
@@ -1589,14 +1297,11 @@ static int process_connect(struct ceph_connection *con) | |||
1589 | * If we sent a smaller connect_seq than the peer has, try | 1297 | * If we sent a smaller connect_seq than the peer has, try |
1590 | * again with a larger value. | 1298 | * again with a larger value. |
1591 | */ | 1299 | */ |
1592 | dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", | 1300 | dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", |
1593 | le32_to_cpu(con->out_connect.connect_seq), | 1301 | le32_to_cpu(con->out_connect.connect_seq), |
1594 | le32_to_cpu(con->in_reply.connect_seq)); | 1302 | le32_to_cpu(con->in_connect.connect_seq)); |
1595 | con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); | 1303 | con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); |
1596 | con_out_kvec_reset(con); | 1304 | prepare_write_connect(con->msgr, con, 0); |
1597 | ret = prepare_write_connect(con); | ||
1598 | if (ret < 0) | ||
1599 | return ret; | ||
1600 | prepare_read_connect(con); | 1305 | prepare_read_connect(con); |
1601 | break; | 1306 | break; |
1602 | 1307 | ||
@@ -1607,13 +1312,10 @@ static int process_connect(struct ceph_connection *con) | |||
1607 | */ | 1312 | */ |
1608 | dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", | 1313 | dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", |
1609 | con->peer_global_seq, | 1314 | con->peer_global_seq, |
1610 | le32_to_cpu(con->in_reply.global_seq)); | 1315 | le32_to_cpu(con->in_connect.global_seq)); |
1611 | get_global_seq(con->msgr, | 1316 | get_global_seq(con->msgr, |
1612 | le32_to_cpu(con->in_reply.global_seq)); | 1317 | le32_to_cpu(con->in_connect.global_seq)); |
1613 | con_out_kvec_reset(con); | 1318 | prepare_write_connect(con->msgr, con, 0); |
1614 | ret = prepare_write_connect(con); | ||
1615 | if (ret < 0) | ||
1616 | return ret; | ||
1617 | prepare_read_connect(con); | 1319 | prepare_read_connect(con); |
1618 | break; | 1320 | break; |
1619 | 1321 | ||
@@ -1625,13 +1327,10 @@ static int process_connect(struct ceph_connection *con) | |||
1625 | ceph_pr_addr(&con->peer_addr.in_addr), | 1327 | ceph_pr_addr(&con->peer_addr.in_addr), |
1626 | req_feat, server_feat, req_feat & ~server_feat); | 1328 | req_feat, server_feat, req_feat & ~server_feat); |
1627 | con->error_msg = "missing required protocol features"; | 1329 | con->error_msg = "missing required protocol features"; |
1628 | reset_connection(con); | 1330 | fail_protocol(con); |
1629 | return -1; | 1331 | return -1; |
1630 | } | 1332 | } |
1631 | 1333 | clear_bit(CONNECTING, &con->state); | |
1632 | WARN_ON(con->state != CON_STATE_NEGOTIATING); | ||
1633 | con->state = CON_STATE_OPEN; | ||
1634 | |||
1635 | con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); | 1334 | con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); |
1636 | con->connect_seq++; | 1335 | con->connect_seq++; |
1637 | con->peer_features = server_feat; | 1336 | con->peer_features = server_feat; |
@@ -1643,9 +1342,7 @@ static int process_connect(struct ceph_connection *con) | |||
1643 | le32_to_cpu(con->in_reply.connect_seq)); | 1342 | le32_to_cpu(con->in_reply.connect_seq)); |
1644 | 1343 | ||
1645 | if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) | 1344 | if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) |
1646 | set_bit(CON_FLAG_LOSSYTX, &con->flags); | 1345 | set_bit(LOSSYTX, &con->state); |
1647 | |||
1648 | con->delay = 0; /* reset backoff memory */ | ||
1649 | 1346 | ||
1650 | prepare_read_tag(con); | 1347 | prepare_read_tag(con); |
1651 | break; | 1348 | break; |
@@ -1675,10 +1372,10 @@ static int process_connect(struct ceph_connection *con) | |||
1675 | */ | 1372 | */ |
1676 | static int read_partial_ack(struct ceph_connection *con) | 1373 | static int read_partial_ack(struct ceph_connection *con) |
1677 | { | 1374 | { |
1678 | int size = sizeof (con->in_temp_ack); | 1375 | int to = 0; |
1679 | int end = size; | ||
1680 | 1376 | ||
1681 | return read_partial(con, end, size, &con->in_temp_ack); | 1377 | return read_partial(con, &to, sizeof(con->in_temp_ack), |
1378 | &con->in_temp_ack); | ||
1682 | } | 1379 | } |
1683 | 1380 | ||
1684 | 1381 | ||
@@ -1724,18 +1421,22 @@ static int read_partial_message_section(struct ceph_connection *con, | |||
1724 | if (ret <= 0) | 1421 | if (ret <= 0) |
1725 | return ret; | 1422 | return ret; |
1726 | section->iov_len += ret; | 1423 | section->iov_len += ret; |
1424 | if (section->iov_len == sec_len) | ||
1425 | *crc = crc32c(0, section->iov_base, | ||
1426 | section->iov_len); | ||
1727 | } | 1427 | } |
1728 | if (section->iov_len == sec_len) | ||
1729 | *crc = crc32c(0, section->iov_base, section->iov_len); | ||
1730 | 1428 | ||
1731 | return 1; | 1429 | return 1; |
1732 | } | 1430 | } |
1733 | 1431 | ||
1734 | static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); | 1432 | static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, |
1433 | struct ceph_msg_header *hdr, | ||
1434 | int *skip); | ||
1435 | |||
1735 | 1436 | ||
1736 | static int read_partial_message_pages(struct ceph_connection *con, | 1437 | static int read_partial_message_pages(struct ceph_connection *con, |
1737 | struct page **pages, | 1438 | struct page **pages, |
1738 | unsigned int data_len, bool do_datacrc) | 1439 | unsigned data_len, int datacrc) |
1739 | { | 1440 | { |
1740 | void *p; | 1441 | void *p; |
1741 | int ret; | 1442 | int ret; |
@@ -1748,7 +1449,7 @@ static int read_partial_message_pages(struct ceph_connection *con, | |||
1748 | p = kmap(pages[con->in_msg_pos.page]); | 1449 | p = kmap(pages[con->in_msg_pos.page]); |
1749 | ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, | 1450 | ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, |
1750 | left); | 1451 | left); |
1751 | if (ret > 0 && do_datacrc) | 1452 | if (ret > 0 && datacrc) |
1752 | con->in_data_crc = | 1453 | con->in_data_crc = |
1753 | crc32c(con->in_data_crc, | 1454 | crc32c(con->in_data_crc, |
1754 | p + con->in_msg_pos.page_pos, ret); | 1455 | p + con->in_msg_pos.page_pos, ret); |
@@ -1768,12 +1469,15 @@ static int read_partial_message_pages(struct ceph_connection *con, | |||
1768 | #ifdef CONFIG_BLOCK | 1469 | #ifdef CONFIG_BLOCK |
1769 | static int read_partial_message_bio(struct ceph_connection *con, | 1470 | static int read_partial_message_bio(struct ceph_connection *con, |
1770 | struct bio **bio_iter, int *bio_seg, | 1471 | struct bio **bio_iter, int *bio_seg, |
1771 | unsigned int data_len, bool do_datacrc) | 1472 | unsigned data_len, int datacrc) |
1772 | { | 1473 | { |
1773 | struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); | 1474 | struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); |
1774 | void *p; | 1475 | void *p; |
1775 | int ret, left; | 1476 | int ret, left; |
1776 | 1477 | ||
1478 | if (IS_ERR(bv)) | ||
1479 | return PTR_ERR(bv); | ||
1480 | |||
1777 | left = min((int)(data_len - con->in_msg_pos.data_pos), | 1481 | left = min((int)(data_len - con->in_msg_pos.data_pos), |
1778 | (int)(bv->bv_len - con->in_msg_pos.page_pos)); | 1482 | (int)(bv->bv_len - con->in_msg_pos.page_pos)); |
1779 | 1483 | ||
@@ -1781,7 +1485,7 @@ static int read_partial_message_bio(struct ceph_connection *con, | |||
1781 | 1485 | ||
1782 | ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, | 1486 | ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, |
1783 | left); | 1487 | left); |
1784 | if (ret > 0 && do_datacrc) | 1488 | if (ret > 0 && datacrc) |
1785 | con->in_data_crc = | 1489 | con->in_data_crc = |
1786 | crc32c(con->in_data_crc, | 1490 | crc32c(con->in_data_crc, |
1787 | p + con->in_msg_pos.page_pos, ret); | 1491 | p + con->in_msg_pos.page_pos, ret); |
@@ -1805,31 +1509,35 @@ static int read_partial_message_bio(struct ceph_connection *con, | |||
1805 | static int read_partial_message(struct ceph_connection *con) | 1509 | static int read_partial_message(struct ceph_connection *con) |
1806 | { | 1510 | { |
1807 | struct ceph_msg *m = con->in_msg; | 1511 | struct ceph_msg *m = con->in_msg; |
1808 | int size; | ||
1809 | int end; | ||
1810 | int ret; | 1512 | int ret; |
1811 | unsigned int front_len, middle_len, data_len; | 1513 | int to, left; |
1812 | bool do_datacrc = !con->msgr->nocrc; | 1514 | unsigned front_len, middle_len, data_len; |
1515 | int datacrc = con->msgr->nocrc; | ||
1516 | int skip; | ||
1813 | u64 seq; | 1517 | u64 seq; |
1814 | u32 crc; | ||
1815 | 1518 | ||
1816 | dout("read_partial_message con %p msg %p\n", con, m); | 1519 | dout("read_partial_message con %p msg %p\n", con, m); |
1817 | 1520 | ||
1818 | /* header */ | 1521 | /* header */ |
1819 | size = sizeof (con->in_hdr); | 1522 | while (con->in_base_pos < sizeof(con->in_hdr)) { |
1820 | end = size; | 1523 | left = sizeof(con->in_hdr) - con->in_base_pos; |
1821 | ret = read_partial(con, end, size, &con->in_hdr); | 1524 | ret = ceph_tcp_recvmsg(con->sock, |
1822 | if (ret <= 0) | 1525 | (char *)&con->in_hdr + con->in_base_pos, |
1823 | return ret; | 1526 | left); |
1824 | 1527 | if (ret <= 0) | |
1825 | crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); | 1528 | return ret; |
1826 | if (cpu_to_le32(crc) != con->in_hdr.crc) { | 1529 | con->in_base_pos += ret; |
1827 | pr_err("read_partial_message bad hdr " | 1530 | if (con->in_base_pos == sizeof(con->in_hdr)) { |
1828 | " crc %u != expected %u\n", | 1531 | u32 crc = crc32c(0, (void *)&con->in_hdr, |
1829 | crc, con->in_hdr.crc); | 1532 | sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); |
1830 | return -EBADMSG; | 1533 | if (crc != le32_to_cpu(con->in_hdr.crc)) { |
1534 | pr_err("read_partial_message bad hdr " | ||
1535 | " crc %u != expected %u\n", | ||
1536 | crc, con->in_hdr.crc); | ||
1537 | return -EBADMSG; | ||
1538 | } | ||
1539 | } | ||
1831 | } | 1540 | } |
1832 | |||
1833 | front_len = le32_to_cpu(con->in_hdr.front_len); | 1541 | front_len = le32_to_cpu(con->in_hdr.front_len); |
1834 | if (front_len > CEPH_MSG_MAX_FRONT_LEN) | 1542 | if (front_len > CEPH_MSG_MAX_FRONT_LEN) |
1835 | return -EIO; | 1543 | return -EIO; |
@@ -1860,13 +1568,10 @@ static int read_partial_message(struct ceph_connection *con) | |||
1860 | 1568 | ||
1861 | /* allocate message? */ | 1569 | /* allocate message? */ |
1862 | if (!con->in_msg) { | 1570 | if (!con->in_msg) { |
1863 | int skip = 0; | ||
1864 | |||
1865 | dout("got hdr type %d front %d data %d\n", con->in_hdr.type, | 1571 | dout("got hdr type %d front %d data %d\n", con->in_hdr.type, |
1866 | con->in_hdr.front_len, con->in_hdr.data_len); | 1572 | con->in_hdr.front_len, con->in_hdr.data_len); |
1867 | ret = ceph_con_in_msg_alloc(con, &skip); | 1573 | skip = 0; |
1868 | if (ret < 0) | 1574 | con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); |
1869 | return ret; | ||
1870 | if (skip) { | 1575 | if (skip) { |
1871 | /* skip this message */ | 1576 | /* skip this message */ |
1872 | dout("alloc_msg said skip message\n"); | 1577 | dout("alloc_msg said skip message\n"); |
@@ -1877,9 +1582,11 @@ static int read_partial_message(struct ceph_connection *con) | |||
1877 | con->in_seq++; | 1582 | con->in_seq++; |
1878 | return 0; | 1583 | return 0; |
1879 | } | 1584 | } |
1880 | 1585 | if (!con->in_msg) { | |
1881 | BUG_ON(!con->in_msg); | 1586 | con->error_msg = |
1882 | BUG_ON(con->in_msg->con != con); | 1587 | "error allocating memory for incoming message"; |
1588 | return -ENOMEM; | ||
1589 | } | ||
1883 | m = con->in_msg; | 1590 | m = con->in_msg; |
1884 | m->front.iov_len = 0; /* haven't read it yet */ | 1591 | m->front.iov_len = 0; /* haven't read it yet */ |
1885 | if (m->middle) | 1592 | if (m->middle) |
@@ -1891,11 +1598,6 @@ static int read_partial_message(struct ceph_connection *con) | |||
1891 | else | 1598 | else |
1892 | con->in_msg_pos.page_pos = 0; | 1599 | con->in_msg_pos.page_pos = 0; |
1893 | con->in_msg_pos.data_pos = 0; | 1600 | con->in_msg_pos.data_pos = 0; |
1894 | |||
1895 | #ifdef CONFIG_BLOCK | ||
1896 | if (m->bio) | ||
1897 | init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); | ||
1898 | #endif | ||
1899 | } | 1601 | } |
1900 | 1602 | ||
1901 | /* front */ | 1603 | /* front */ |
@@ -1912,20 +1614,24 @@ static int read_partial_message(struct ceph_connection *con) | |||
1912 | if (ret <= 0) | 1614 | if (ret <= 0) |
1913 | return ret; | 1615 | return ret; |
1914 | } | 1616 | } |
1617 | #ifdef CONFIG_BLOCK | ||
1618 | if (m->bio && !m->bio_iter) | ||
1619 | init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); | ||
1620 | #endif | ||
1915 | 1621 | ||
1916 | /* (page) data */ | 1622 | /* (page) data */ |
1917 | while (con->in_msg_pos.data_pos < data_len) { | 1623 | while (con->in_msg_pos.data_pos < data_len) { |
1918 | if (m->pages) { | 1624 | if (m->pages) { |
1919 | ret = read_partial_message_pages(con, m->pages, | 1625 | ret = read_partial_message_pages(con, m->pages, |
1920 | data_len, do_datacrc); | 1626 | data_len, datacrc); |
1921 | if (ret <= 0) | 1627 | if (ret <= 0) |
1922 | return ret; | 1628 | return ret; |
1923 | #ifdef CONFIG_BLOCK | 1629 | #ifdef CONFIG_BLOCK |
1924 | } else if (m->bio) { | 1630 | } else if (m->bio) { |
1925 | BUG_ON(!m->bio_iter); | 1631 | |
1926 | ret = read_partial_message_bio(con, | 1632 | ret = read_partial_message_bio(con, |
1927 | &m->bio_iter, &m->bio_seg, | 1633 | &m->bio_iter, &m->bio_seg, |
1928 | data_len, do_datacrc); | 1634 | data_len, datacrc); |
1929 | if (ret <= 0) | 1635 | if (ret <= 0) |
1930 | return ret; | 1636 | return ret; |
1931 | #endif | 1637 | #endif |
@@ -1935,12 +1641,16 @@ static int read_partial_message(struct ceph_connection *con) | |||
1935 | } | 1641 | } |
1936 | 1642 | ||
1937 | /* footer */ | 1643 | /* footer */ |
1938 | size = sizeof (m->footer); | 1644 | to = sizeof(m->hdr) + sizeof(m->footer); |
1939 | end += size; | 1645 | while (con->in_base_pos < to) { |
1940 | ret = read_partial(con, end, size, &m->footer); | 1646 | left = to - con->in_base_pos; |
1941 | if (ret <= 0) | 1647 | ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + |
1942 | return ret; | 1648 | (con->in_base_pos - sizeof(m->hdr)), |
1943 | 1649 | left); | |
1650 | if (ret <= 0) | ||
1651 | return ret; | ||
1652 | con->in_base_pos += ret; | ||
1653 | } | ||
1944 | dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", | 1654 | dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", |
1945 | m, front_len, m->footer.front_crc, middle_len, | 1655 | m, front_len, m->footer.front_crc, middle_len, |
1946 | m->footer.middle_crc, data_len, m->footer.data_crc); | 1656 | m->footer.middle_crc, data_len, m->footer.data_crc); |
@@ -1956,7 +1666,7 @@ static int read_partial_message(struct ceph_connection *con) | |||
1956 | m, con->in_middle_crc, m->footer.middle_crc); | 1666 | m, con->in_middle_crc, m->footer.middle_crc); |
1957 | return -EBADMSG; | 1667 | return -EBADMSG; |
1958 | } | 1668 | } |
1959 | if (do_datacrc && | 1669 | if (datacrc && |
1960 | (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && | 1670 | (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && |
1961 | con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { | 1671 | con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { |
1962 | pr_err("read_partial_message %p data crc %u != exp. %u\n", m, | 1672 | pr_err("read_partial_message %p data crc %u != exp. %u\n", m, |
@@ -1976,11 +1686,8 @@ static void process_message(struct ceph_connection *con) | |||
1976 | { | 1686 | { |
1977 | struct ceph_msg *msg; | 1687 | struct ceph_msg *msg; |
1978 | 1688 | ||
1979 | BUG_ON(con->in_msg->con != con); | ||
1980 | con->in_msg->con = NULL; | ||
1981 | msg = con->in_msg; | 1689 | msg = con->in_msg; |
1982 | con->in_msg = NULL; | 1690 | con->in_msg = NULL; |
1983 | con->ops->put(con); | ||
1984 | 1691 | ||
1985 | /* if first message, set peer_name */ | 1692 | /* if first message, set peer_name */ |
1986 | if (con->peer_name.type == 0) | 1693 | if (con->peer_name.type == 0) |
@@ -2000,6 +1707,7 @@ static void process_message(struct ceph_connection *con) | |||
2000 | con->ops->dispatch(con, msg); | 1707 | con->ops->dispatch(con, msg); |
2001 | 1708 | ||
2002 | mutex_lock(&con->mutex); | 1709 | mutex_lock(&con->mutex); |
1710 | prepare_read_tag(con); | ||
2003 | } | 1711 | } |
2004 | 1712 | ||
2005 | 1713 | ||
@@ -2009,29 +1717,32 @@ static void process_message(struct ceph_connection *con) | |||
2009 | */ | 1717 | */ |
2010 | static int try_write(struct ceph_connection *con) | 1718 | static int try_write(struct ceph_connection *con) |
2011 | { | 1719 | { |
1720 | struct ceph_messenger *msgr = con->msgr; | ||
2012 | int ret = 1; | 1721 | int ret = 1; |
2013 | 1722 | ||
2014 | dout("try_write start %p state %lu\n", con, con->state); | 1723 | dout("try_write start %p state %lu nref %d\n", con, con->state, |
1724 | atomic_read(&con->nref)); | ||
2015 | 1725 | ||
2016 | more: | 1726 | more: |
2017 | dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); | 1727 | dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); |
2018 | 1728 | ||
2019 | /* open the socket first? */ | 1729 | /* open the socket first? */ |
2020 | if (con->state == CON_STATE_PREOPEN) { | 1730 | if (con->sock == NULL) { |
2021 | BUG_ON(con->sock); | 1731 | prepare_write_banner(msgr, con); |
2022 | con->state = CON_STATE_CONNECTING; | 1732 | prepare_write_connect(msgr, con, 1); |
2023 | |||
2024 | con_out_kvec_reset(con); | ||
2025 | prepare_write_banner(con); | ||
2026 | prepare_read_banner(con); | 1733 | prepare_read_banner(con); |
1734 | set_bit(CONNECTING, &con->state); | ||
1735 | clear_bit(NEGOTIATING, &con->state); | ||
2027 | 1736 | ||
2028 | BUG_ON(con->in_msg); | 1737 | BUG_ON(con->in_msg); |
2029 | con->in_tag = CEPH_MSGR_TAG_READY; | 1738 | con->in_tag = CEPH_MSGR_TAG_READY; |
2030 | dout("try_write initiating connect on %p new state %lu\n", | 1739 | dout("try_write initiating connect on %p new state %lu\n", |
2031 | con, con->state); | 1740 | con, con->state); |
2032 | ret = ceph_tcp_connect(con); | 1741 | con->sock = ceph_tcp_connect(con); |
2033 | if (ret < 0) { | 1742 | if (IS_ERR(con->sock)) { |
1743 | con->sock = NULL; | ||
2034 | con->error_msg = "connect error"; | 1744 | con->error_msg = "connect error"; |
1745 | ret = -1; | ||
2035 | goto out; | 1746 | goto out; |
2036 | } | 1747 | } |
2037 | } | 1748 | } |
@@ -2070,7 +1781,7 @@ more_kvec: | |||
2070 | } | 1781 | } |
2071 | 1782 | ||
2072 | do_next: | 1783 | do_next: |
2073 | if (con->state == CON_STATE_OPEN) { | 1784 | if (!test_bit(CONNECTING, &con->state)) { |
2074 | /* is anything else pending? */ | 1785 | /* is anything else pending? */ |
2075 | if (!list_empty(&con->out_queue)) { | 1786 | if (!list_empty(&con->out_queue)) { |
2076 | prepare_write_message(con); | 1787 | prepare_write_message(con); |
@@ -2080,15 +1791,14 @@ do_next: | |||
2080 | prepare_write_ack(con); | 1791 | prepare_write_ack(con); |
2081 | goto more; | 1792 | goto more; |
2082 | } | 1793 | } |
2083 | if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, | 1794 | if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { |
2084 | &con->flags)) { | ||
2085 | prepare_write_keepalive(con); | 1795 | prepare_write_keepalive(con); |
2086 | goto more; | 1796 | goto more; |
2087 | } | 1797 | } |
2088 | } | 1798 | } |
2089 | 1799 | ||
2090 | /* Nothing to do! */ | 1800 | /* Nothing to do! */ |
2091 | clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 1801 | clear_bit(WRITE_PENDING, &con->state); |
2092 | dout("try_write nothing else to write.\n"); | 1802 | dout("try_write nothing else to write.\n"); |
2093 | ret = 0; | 1803 | ret = 0; |
2094 | out: | 1804 | out: |
@@ -2105,45 +1815,38 @@ static int try_read(struct ceph_connection *con) | |||
2105 | { | 1815 | { |
2106 | int ret = -1; | 1816 | int ret = -1; |
2107 | 1817 | ||
2108 | more: | 1818 | if (!con->sock) |
2109 | dout("try_read start on %p state %lu\n", con, con->state); | 1819 | return 0; |
2110 | if (con->state != CON_STATE_CONNECTING && | 1820 | |
2111 | con->state != CON_STATE_NEGOTIATING && | 1821 | if (test_bit(STANDBY, &con->state)) |
2112 | con->state != CON_STATE_OPEN) | ||
2113 | return 0; | 1822 | return 0; |
2114 | 1823 | ||
2115 | BUG_ON(!con->sock); | 1824 | dout("try_read start on %p\n", con); |
2116 | 1825 | ||
1826 | more: | ||
2117 | dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, | 1827 | dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, |
2118 | con->in_base_pos); | 1828 | con->in_base_pos); |
2119 | 1829 | ||
2120 | if (con->state == CON_STATE_CONNECTING) { | 1830 | /* |
2121 | dout("try_read connecting\n"); | 1831 | * process_connect and process_message drop and re-take |
2122 | ret = read_partial_banner(con); | 1832 | * con->mutex. make sure we handle a racing close or reopen. |
2123 | if (ret <= 0) | 1833 | */ |
2124 | goto out; | 1834 | if (test_bit(CLOSED, &con->state) || |
2125 | ret = process_banner(con); | 1835 | test_bit(OPENING, &con->state)) { |
2126 | if (ret < 0) | 1836 | ret = -EAGAIN; |
2127 | goto out; | ||
2128 | |||
2129 | con->state = CON_STATE_NEGOTIATING; | ||
2130 | |||
2131 | /* | ||
2132 | * Received banner is good, exchange connection info. | ||
2133 | * Do not reset out_kvec, as sending our banner raced | ||
2134 | * with receiving peer banner after connect completed. | ||
2135 | */ | ||
2136 | ret = prepare_write_connect(con); | ||
2137 | if (ret < 0) | ||
2138 | goto out; | ||
2139 | prepare_read_connect(con); | ||
2140 | |||
2141 | /* Send connection info before awaiting response */ | ||
2142 | goto out; | 1837 | goto out; |
2143 | } | 1838 | } |
2144 | 1839 | ||
2145 | if (con->state == CON_STATE_NEGOTIATING) { | 1840 | if (test_bit(CONNECTING, &con->state)) { |
2146 | dout("try_read negotiating\n"); | 1841 | if (!test_bit(NEGOTIATING, &con->state)) { |
1842 | dout("try_read connecting\n"); | ||
1843 | ret = read_partial_banner(con); | ||
1844 | if (ret <= 0) | ||
1845 | goto out; | ||
1846 | ret = process_banner(con); | ||
1847 | if (ret < 0) | ||
1848 | goto out; | ||
1849 | } | ||
2147 | ret = read_partial_connect(con); | 1850 | ret = read_partial_connect(con); |
2148 | if (ret <= 0) | 1851 | if (ret <= 0) |
2149 | goto out; | 1852 | goto out; |
@@ -2153,17 +1856,14 @@ more: | |||
2153 | goto more; | 1856 | goto more; |
2154 | } | 1857 | } |
2155 | 1858 | ||
2156 | WARN_ON(con->state != CON_STATE_OPEN); | ||
2157 | |||
2158 | if (con->in_base_pos < 0) { | 1859 | if (con->in_base_pos < 0) { |
2159 | /* | 1860 | /* |
2160 | * skipping + discarding content. | 1861 | * skipping + discarding content. |
2161 | * | 1862 | * |
2162 | * FIXME: there must be a better way to do this! | 1863 | * FIXME: there must be a better way to do this! |
2163 | */ | 1864 | */ |
2164 | static char buf[SKIP_BUF_SIZE]; | 1865 | static char buf[1024]; |
2165 | int skip = min((int) sizeof (buf), -con->in_base_pos); | 1866 | int skip = min(1024, -con->in_base_pos); |
2166 | |||
2167 | dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); | 1867 | dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); |
2168 | ret = ceph_tcp_recvmsg(con->sock, buf, skip); | 1868 | ret = ceph_tcp_recvmsg(con->sock, buf, skip); |
2169 | if (ret <= 0) | 1869 | if (ret <= 0) |
@@ -2188,8 +1888,7 @@ more: | |||
2188 | prepare_read_ack(con); | 1888 | prepare_read_ack(con); |
2189 | break; | 1889 | break; |
2190 | case CEPH_MSGR_TAG_CLOSE: | 1890 | case CEPH_MSGR_TAG_CLOSE: |
2191 | con_close_socket(con); | 1891 | set_bit(CLOSED, &con->state); /* fixme */ |
2192 | con->state = CON_STATE_CLOSED; | ||
2193 | goto out; | 1892 | goto out; |
2194 | default: | 1893 | default: |
2195 | goto bad_tag; | 1894 | goto bad_tag; |
@@ -2212,8 +1911,6 @@ more: | |||
2212 | if (con->in_tag == CEPH_MSGR_TAG_READY) | 1911 | if (con->in_tag == CEPH_MSGR_TAG_READY) |
2213 | goto more; | 1912 | goto more; |
2214 | process_message(con); | 1913 | process_message(con); |
2215 | if (con->state == CON_STATE_OPEN) | ||
2216 | prepare_read_tag(con); | ||
2217 | goto more; | 1914 | goto more; |
2218 | } | 1915 | } |
2219 | if (con->in_tag == CEPH_MSGR_TAG_ACK) { | 1916 | if (con->in_tag == CEPH_MSGR_TAG_ACK) { |
@@ -2237,62 +1934,28 @@ bad_tag: | |||
2237 | 1934 | ||
2238 | 1935 | ||
2239 | /* | 1936 | /* |
2240 | * Atomically queue work on a connection after the specified delay. | 1937 | * Atomically queue work on a connection. Bump @con reference to |
2241 | * Bump @con reference to avoid races with connection teardown. | 1938 | * avoid races with connection teardown. |
2242 | * Returns 0 if work was queued, or an error code otherwise. | ||
2243 | */ | 1939 | */ |
2244 | static int queue_con_delay(struct ceph_connection *con, unsigned long delay) | 1940 | static void queue_con(struct ceph_connection *con) |
2245 | { | 1941 | { |
2246 | if (!con->ops->get(con)) { | 1942 | if (test_bit(DEAD, &con->state)) { |
2247 | dout("%s %p ref count 0\n", __func__, con); | 1943 | dout("queue_con %p ignoring: DEAD\n", |
2248 | 1944 | con); | |
2249 | return -ENOENT; | 1945 | return; |
2250 | } | 1946 | } |
2251 | 1947 | ||
2252 | if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { | 1948 | if (!con->ops->get(con)) { |
2253 | dout("%s %p - already queued\n", __func__, con); | 1949 | dout("queue_con %p ref count 0\n", con); |
2254 | con->ops->put(con); | 1950 | return; |
2255 | |||
2256 | return -EBUSY; | ||
2257 | } | 1951 | } |
2258 | 1952 | ||
2259 | dout("%s %p %lu\n", __func__, con, delay); | 1953 | if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) { |
2260 | 1954 | dout("queue_con %p - already queued\n", con); | |
2261 | return 0; | 1955 | con->ops->put(con); |
2262 | } | 1956 | } else { |
2263 | 1957 | dout("queue_con %p\n", con); | |
2264 | static void queue_con(struct ceph_connection *con) | ||
2265 | { | ||
2266 | (void) queue_con_delay(con, 0); | ||
2267 | } | ||
2268 | |||
2269 | static bool con_sock_closed(struct ceph_connection *con) | ||
2270 | { | ||
2271 | if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) | ||
2272 | return false; | ||
2273 | |||
2274 | #define CASE(x) \ | ||
2275 | case CON_STATE_ ## x: \ | ||
2276 | con->error_msg = "socket closed (con state " #x ")"; \ | ||
2277 | break; | ||
2278 | |||
2279 | switch (con->state) { | ||
2280 | CASE(CLOSED); | ||
2281 | CASE(PREOPEN); | ||
2282 | CASE(CONNECTING); | ||
2283 | CASE(NEGOTIATING); | ||
2284 | CASE(OPEN); | ||
2285 | CASE(STANDBY); | ||
2286 | default: | ||
2287 | pr_warning("%s con %p unrecognized state %lu\n", | ||
2288 | __func__, con, con->state); | ||
2289 | con->error_msg = "unrecognized con state"; | ||
2290 | BUG(); | ||
2291 | break; | ||
2292 | } | 1958 | } |
2293 | #undef CASE | ||
2294 | |||
2295 | return true; | ||
2296 | } | 1959 | } |
2297 | 1960 | ||
2298 | /* | 1961 | /* |
@@ -2306,50 +1969,49 @@ static void con_work(struct work_struct *work) | |||
2306 | 1969 | ||
2307 | mutex_lock(&con->mutex); | 1970 | mutex_lock(&con->mutex); |
2308 | restart: | 1971 | restart: |
2309 | if (con_sock_closed(con)) | 1972 | if (test_and_clear_bit(BACKOFF, &con->state)) { |
2310 | goto fault; | ||
2311 | |||
2312 | if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { | ||
2313 | dout("con_work %p backing off\n", con); | 1973 | dout("con_work %p backing off\n", con); |
2314 | ret = queue_con_delay(con, round_jiffies_relative(con->delay)); | 1974 | if (queue_delayed_work(ceph_msgr_wq, &con->work, |
2315 | if (ret) { | 1975 | round_jiffies_relative(con->delay))) { |
1976 | dout("con_work %p backoff %lu\n", con, con->delay); | ||
1977 | mutex_unlock(&con->mutex); | ||
1978 | return; | ||
1979 | } else { | ||
1980 | con->ops->put(con); | ||
2316 | dout("con_work %p FAILED to back off %lu\n", con, | 1981 | dout("con_work %p FAILED to back off %lu\n", con, |
2317 | con->delay); | 1982 | con->delay); |
2318 | BUG_ON(ret == -ENOENT); | ||
2319 | set_bit(CON_FLAG_BACKOFF, &con->flags); | ||
2320 | } | 1983 | } |
2321 | goto done; | ||
2322 | } | 1984 | } |
2323 | 1985 | ||
2324 | if (con->state == CON_STATE_STANDBY) { | 1986 | if (test_bit(STANDBY, &con->state)) { |
2325 | dout("con_work %p STANDBY\n", con); | 1987 | dout("con_work %p STANDBY\n", con); |
2326 | goto done; | 1988 | goto done; |
2327 | } | 1989 | } |
2328 | if (con->state == CON_STATE_CLOSED) { | 1990 | if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ |
2329 | dout("con_work %p CLOSED\n", con); | 1991 | dout("con_work CLOSED\n"); |
2330 | BUG_ON(con->sock); | 1992 | con_close_socket(con); |
2331 | goto done; | 1993 | goto done; |
2332 | } | 1994 | } |
2333 | if (con->state == CON_STATE_PREOPEN) { | 1995 | if (test_and_clear_bit(OPENING, &con->state)) { |
1996 | /* reopen w/ new peer */ | ||
2334 | dout("con_work OPENING\n"); | 1997 | dout("con_work OPENING\n"); |
2335 | BUG_ON(con->sock); | 1998 | con_close_socket(con); |
2336 | } | 1999 | } |
2337 | 2000 | ||
2001 | if (test_and_clear_bit(SOCK_CLOSED, &con->state)) | ||
2002 | goto fault; | ||
2003 | |||
2338 | ret = try_read(con); | 2004 | ret = try_read(con); |
2339 | if (ret == -EAGAIN) | 2005 | if (ret == -EAGAIN) |
2340 | goto restart; | 2006 | goto restart; |
2341 | if (ret < 0) { | 2007 | if (ret < 0) |
2342 | con->error_msg = "socket error on read"; | ||
2343 | goto fault; | 2008 | goto fault; |
2344 | } | ||
2345 | 2009 | ||
2346 | ret = try_write(con); | 2010 | ret = try_write(con); |
2347 | if (ret == -EAGAIN) | 2011 | if (ret == -EAGAIN) |
2348 | goto restart; | 2012 | goto restart; |
2349 | if (ret < 0) { | 2013 | if (ret < 0) |
2350 | con->error_msg = "socket error on write"; | ||
2351 | goto fault; | 2014 | goto fault; |
2352 | } | ||
2353 | 2015 | ||
2354 | done: | 2016 | done: |
2355 | mutex_unlock(&con->mutex); | 2017 | mutex_unlock(&con->mutex); |
@@ -2358,6 +2020,7 @@ done_unlocked: | |||
2358 | return; | 2020 | return; |
2359 | 2021 | ||
2360 | fault: | 2022 | fault: |
2023 | mutex_unlock(&con->mutex); | ||
2361 | ceph_fault(con); /* error/fault path */ | 2024 | ceph_fault(con); /* error/fault path */ |
2362 | goto done_unlocked; | 2025 | goto done_unlocked; |
2363 | } | 2026 | } |
@@ -2368,31 +2031,26 @@ fault: | |||
2368 | * exponential backoff | 2031 | * exponential backoff |
2369 | */ | 2032 | */ |
2370 | static void ceph_fault(struct ceph_connection *con) | 2033 | static void ceph_fault(struct ceph_connection *con) |
2371 | __releases(con->mutex) | ||
2372 | { | 2034 | { |
2373 | pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), | 2035 | pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), |
2374 | ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); | 2036 | ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); |
2375 | dout("fault %p state %lu to peer %s\n", | 2037 | dout("fault %p state %lu to peer %s\n", |
2376 | con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); | 2038 | con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); |
2377 | 2039 | ||
2378 | WARN_ON(con->state != CON_STATE_CONNECTING && | 2040 | if (test_bit(LOSSYTX, &con->state)) { |
2379 | con->state != CON_STATE_NEGOTIATING && | 2041 | dout("fault on LOSSYTX channel\n"); |
2380 | con->state != CON_STATE_OPEN); | 2042 | goto out; |
2381 | 2043 | } | |
2382 | con_close_socket(con); | ||
2383 | 2044 | ||
2384 | if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { | 2045 | mutex_lock(&con->mutex); |
2385 | dout("fault on LOSSYTX channel, marking CLOSED\n"); | 2046 | if (test_bit(CLOSED, &con->state)) |
2386 | con->state = CON_STATE_CLOSED; | ||
2387 | goto out_unlock; | 2047 | goto out_unlock; |
2388 | } | 2048 | |
2049 | con_close_socket(con); | ||
2389 | 2050 | ||
2390 | if (con->in_msg) { | 2051 | if (con->in_msg) { |
2391 | BUG_ON(con->in_msg->con != con); | ||
2392 | con->in_msg->con = NULL; | ||
2393 | ceph_msg_put(con->in_msg); | 2052 | ceph_msg_put(con->in_msg); |
2394 | con->in_msg = NULL; | 2053 | con->in_msg = NULL; |
2395 | con->ops->put(con); | ||
2396 | } | 2054 | } |
2397 | 2055 | ||
2398 | /* Requeue anything that hasn't been acked */ | 2056 | /* Requeue anything that hasn't been acked */ |
@@ -2401,23 +2059,39 @@ static void ceph_fault(struct ceph_connection *con) | |||
2401 | /* If there are no messages queued or keepalive pending, place | 2059 | /* If there are no messages queued or keepalive pending, place |
2402 | * the connection in a STANDBY state */ | 2060 | * the connection in a STANDBY state */ |
2403 | if (list_empty(&con->out_queue) && | 2061 | if (list_empty(&con->out_queue) && |
2404 | !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { | 2062 | !test_bit(KEEPALIVE_PENDING, &con->state)) { |
2405 | dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); | 2063 | dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); |
2406 | clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 2064 | clear_bit(WRITE_PENDING, &con->state); |
2407 | con->state = CON_STATE_STANDBY; | 2065 | set_bit(STANDBY, &con->state); |
2408 | } else { | 2066 | } else { |
2409 | /* retry after a delay. */ | 2067 | /* retry after a delay. */ |
2410 | con->state = CON_STATE_PREOPEN; | ||
2411 | if (con->delay == 0) | 2068 | if (con->delay == 0) |
2412 | con->delay = BASE_DELAY_INTERVAL; | 2069 | con->delay = BASE_DELAY_INTERVAL; |
2413 | else if (con->delay < MAX_DELAY_INTERVAL) | 2070 | else if (con->delay < MAX_DELAY_INTERVAL) |
2414 | con->delay *= 2; | 2071 | con->delay *= 2; |
2415 | set_bit(CON_FLAG_BACKOFF, &con->flags); | 2072 | con->ops->get(con); |
2416 | queue_con(con); | 2073 | if (queue_delayed_work(ceph_msgr_wq, &con->work, |
2074 | round_jiffies_relative(con->delay))) { | ||
2075 | dout("fault queued %p delay %lu\n", con, con->delay); | ||
2076 | } else { | ||
2077 | con->ops->put(con); | ||
2078 | dout("fault failed to queue %p delay %lu, backoff\n", | ||
2079 | con, con->delay); | ||
2080 | /* | ||
2081 | * In many cases we see a socket state change | ||
2082 | * while con_work is running and end up | ||
2083 | * queuing (non-delayed) work, such that we | ||
2084 | * can't backoff with a delay. Set a flag so | ||
2085 | * that when con_work restarts we schedule the | ||
2086 | * delay then. | ||
2087 | */ | ||
2088 | set_bit(BACKOFF, &con->state); | ||
2089 | } | ||
2417 | } | 2090 | } |
2418 | 2091 | ||
2419 | out_unlock: | 2092 | out_unlock: |
2420 | mutex_unlock(&con->mutex); | 2093 | mutex_unlock(&con->mutex); |
2094 | out: | ||
2421 | /* | 2095 | /* |
2422 | * in case we faulted due to authentication, invalidate our | 2096 | * in case we faulted due to authentication, invalidate our |
2423 | * current tickets so that we can get new ones. | 2097 | * current tickets so that we can get new ones. |
@@ -2434,19 +2108,32 @@ out_unlock: | |||
2434 | 2108 | ||
2435 | 2109 | ||
2436 | /* | 2110 | /* |
2437 | * initialize a new messenger instance | 2111 | * create a new messenger instance |
2438 | */ | 2112 | */ |
2439 | void ceph_messenger_init(struct ceph_messenger *msgr, | 2113 | struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, |
2440 | struct ceph_entity_addr *myaddr, | 2114 | u32 supported_features, |
2441 | u32 supported_features, | 2115 | u32 required_features) |
2442 | u32 required_features, | ||
2443 | bool nocrc) | ||
2444 | { | 2116 | { |
2117 | struct ceph_messenger *msgr; | ||
2118 | |||
2119 | msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); | ||
2120 | if (msgr == NULL) | ||
2121 | return ERR_PTR(-ENOMEM); | ||
2122 | |||
2445 | msgr->supported_features = supported_features; | 2123 | msgr->supported_features = supported_features; |
2446 | msgr->required_features = required_features; | 2124 | msgr->required_features = required_features; |
2447 | 2125 | ||
2448 | spin_lock_init(&msgr->global_seq_lock); | 2126 | spin_lock_init(&msgr->global_seq_lock); |
2449 | 2127 | ||
2128 | /* the zero page is needed if a request is "canceled" while the message | ||
2129 | * is being written over the socket */ | ||
2130 | msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); | ||
2131 | if (!msgr->zero_page) { | ||
2132 | kfree(msgr); | ||
2133 | return ERR_PTR(-ENOMEM); | ||
2134 | } | ||
2135 | kmap(msgr->zero_page); | ||
2136 | |||
2450 | if (myaddr) | 2137 | if (myaddr) |
2451 | msgr->inst.addr = *myaddr; | 2138 | msgr->inst.addr = *myaddr; |
2452 | 2139 | ||
@@ -2454,23 +2141,32 @@ void ceph_messenger_init(struct ceph_messenger *msgr, | |||
2454 | msgr->inst.addr.type = 0; | 2141 | msgr->inst.addr.type = 0; |
2455 | get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); | 2142 | get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); |
2456 | encode_my_addr(msgr); | 2143 | encode_my_addr(msgr); |
2457 | msgr->nocrc = nocrc; | ||
2458 | 2144 | ||
2459 | atomic_set(&msgr->stopping, 0); | 2145 | dout("messenger_create %p\n", msgr); |
2146 | return msgr; | ||
2147 | } | ||
2148 | EXPORT_SYMBOL(ceph_messenger_create); | ||
2460 | 2149 | ||
2461 | dout("%s %p\n", __func__, msgr); | 2150 | void ceph_messenger_destroy(struct ceph_messenger *msgr) |
2151 | { | ||
2152 | dout("destroy %p\n", msgr); | ||
2153 | kunmap(msgr->zero_page); | ||
2154 | __free_page(msgr->zero_page); | ||
2155 | kfree(msgr); | ||
2156 | dout("destroyed messenger %p\n", msgr); | ||
2462 | } | 2157 | } |
2463 | EXPORT_SYMBOL(ceph_messenger_init); | 2158 | EXPORT_SYMBOL(ceph_messenger_destroy); |
2464 | 2159 | ||
2465 | static void clear_standby(struct ceph_connection *con) | 2160 | static void clear_standby(struct ceph_connection *con) |
2466 | { | 2161 | { |
2467 | /* come back from STANDBY? */ | 2162 | /* come back from STANDBY? */ |
2468 | if (con->state == CON_STATE_STANDBY) { | 2163 | if (test_and_clear_bit(STANDBY, &con->state)) { |
2164 | mutex_lock(&con->mutex); | ||
2469 | dout("clear_standby %p and ++connect_seq\n", con); | 2165 | dout("clear_standby %p and ++connect_seq\n", con); |
2470 | con->state = CON_STATE_PREOPEN; | ||
2471 | con->connect_seq++; | 2166 | con->connect_seq++; |
2472 | WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); | 2167 | WARN_ON(test_bit(WRITE_PENDING, &con->state)); |
2473 | WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); | 2168 | WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); |
2169 | mutex_unlock(&con->mutex); | ||
2474 | } | 2170 | } |
2475 | } | 2171 | } |
2476 | 2172 | ||
@@ -2479,24 +2175,21 @@ static void clear_standby(struct ceph_connection *con) | |||
2479 | */ | 2175 | */ |
2480 | void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) | 2176 | void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) |
2481 | { | 2177 | { |
2482 | /* set src+dst */ | 2178 | if (test_bit(CLOSED, &con->state)) { |
2483 | msg->hdr.src = con->msgr->inst.name; | ||
2484 | BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); | ||
2485 | msg->needs_out_seq = true; | ||
2486 | |||
2487 | mutex_lock(&con->mutex); | ||
2488 | |||
2489 | if (con->state == CON_STATE_CLOSED) { | ||
2490 | dout("con_send %p closed, dropping %p\n", con, msg); | 2179 | dout("con_send %p closed, dropping %p\n", con, msg); |
2491 | ceph_msg_put(msg); | 2180 | ceph_msg_put(msg); |
2492 | mutex_unlock(&con->mutex); | ||
2493 | return; | 2181 | return; |
2494 | } | 2182 | } |
2495 | 2183 | ||
2496 | BUG_ON(msg->con != NULL); | 2184 | /* set src+dst */ |
2497 | msg->con = con->ops->get(con); | 2185 | msg->hdr.src = con->msgr->inst.name; |
2498 | BUG_ON(msg->con == NULL); | 2186 | |
2187 | BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); | ||
2499 | 2188 | ||
2189 | msg->needs_out_seq = true; | ||
2190 | |||
2191 | /* queue */ | ||
2192 | mutex_lock(&con->mutex); | ||
2500 | BUG_ON(!list_empty(&msg->list_head)); | 2193 | BUG_ON(!list_empty(&msg->list_head)); |
2501 | list_add_tail(&msg->list_head, &con->out_queue); | 2194 | list_add_tail(&msg->list_head, &con->out_queue); |
2502 | dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, | 2195 | dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, |
@@ -2505,13 +2198,12 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) | |||
2505 | le32_to_cpu(msg->hdr.front_len), | 2198 | le32_to_cpu(msg->hdr.front_len), |
2506 | le32_to_cpu(msg->hdr.middle_len), | 2199 | le32_to_cpu(msg->hdr.middle_len), |
2507 | le32_to_cpu(msg->hdr.data_len)); | 2200 | le32_to_cpu(msg->hdr.data_len)); |
2508 | |||
2509 | clear_standby(con); | ||
2510 | mutex_unlock(&con->mutex); | 2201 | mutex_unlock(&con->mutex); |
2511 | 2202 | ||
2512 | /* if there wasn't anything waiting to send before, queue | 2203 | /* if there wasn't anything waiting to send before, queue |
2513 | * new work */ | 2204 | * new work */ |
2514 | if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) | 2205 | clear_standby(con); |
2206 | if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) | ||
2515 | queue_con(con); | 2207 | queue_con(con); |
2516 | } | 2208 | } |
2517 | EXPORT_SYMBOL(ceph_con_send); | 2209 | EXPORT_SYMBOL(ceph_con_send); |
@@ -2519,34 +2211,24 @@ EXPORT_SYMBOL(ceph_con_send); | |||
2519 | /* | 2211 | /* |
2520 | * Revoke a message that was previously queued for send | 2212 | * Revoke a message that was previously queued for send |
2521 | */ | 2213 | */ |
2522 | void ceph_msg_revoke(struct ceph_msg *msg) | 2214 | void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) |
2523 | { | 2215 | { |
2524 | struct ceph_connection *con = msg->con; | ||
2525 | |||
2526 | if (!con) | ||
2527 | return; /* Message not in our possession */ | ||
2528 | |||
2529 | mutex_lock(&con->mutex); | 2216 | mutex_lock(&con->mutex); |
2530 | if (!list_empty(&msg->list_head)) { | 2217 | if (!list_empty(&msg->list_head)) { |
2531 | dout("%s %p msg %p - was on queue\n", __func__, con, msg); | 2218 | dout("con_revoke %p msg %p - was on queue\n", con, msg); |
2532 | list_del_init(&msg->list_head); | 2219 | list_del_init(&msg->list_head); |
2533 | BUG_ON(msg->con == NULL); | ||
2534 | msg->con->ops->put(msg->con); | ||
2535 | msg->con = NULL; | ||
2536 | msg->hdr.seq = 0; | ||
2537 | |||
2538 | ceph_msg_put(msg); | 2220 | ceph_msg_put(msg); |
2221 | msg->hdr.seq = 0; | ||
2539 | } | 2222 | } |
2540 | if (con->out_msg == msg) { | 2223 | if (con->out_msg == msg) { |
2541 | dout("%s %p msg %p - was sending\n", __func__, con, msg); | 2224 | dout("con_revoke %p msg %p - was sending\n", con, msg); |
2542 | con->out_msg = NULL; | 2225 | con->out_msg = NULL; |
2543 | if (con->out_kvec_is_msg) { | 2226 | if (con->out_kvec_is_msg) { |
2544 | con->out_skip = con->out_kvec_bytes; | 2227 | con->out_skip = con->out_kvec_bytes; |
2545 | con->out_kvec_is_msg = false; | 2228 | con->out_kvec_is_msg = false; |
2546 | } | 2229 | } |
2547 | msg->hdr.seq = 0; | ||
2548 | |||
2549 | ceph_msg_put(msg); | 2230 | ceph_msg_put(msg); |
2231 | msg->hdr.seq = 0; | ||
2550 | } | 2232 | } |
2551 | mutex_unlock(&con->mutex); | 2233 | mutex_unlock(&con->mutex); |
2552 | } | 2234 | } |
@@ -2554,27 +2236,17 @@ void ceph_msg_revoke(struct ceph_msg *msg) | |||
2554 | /* | 2236 | /* |
2555 | * Revoke a message that we may be reading data into | 2237 | * Revoke a message that we may be reading data into |
2556 | */ | 2238 | */ |
2557 | void ceph_msg_revoke_incoming(struct ceph_msg *msg) | 2239 | void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) |
2558 | { | 2240 | { |
2559 | struct ceph_connection *con; | ||
2560 | |||
2561 | BUG_ON(msg == NULL); | ||
2562 | if (!msg->con) { | ||
2563 | dout("%s msg %p null con\n", __func__, msg); | ||
2564 | |||
2565 | return; /* Message not in our possession */ | ||
2566 | } | ||
2567 | |||
2568 | con = msg->con; | ||
2569 | mutex_lock(&con->mutex); | 2241 | mutex_lock(&con->mutex); |
2570 | if (con->in_msg == msg) { | 2242 | if (con->in_msg && con->in_msg == msg) { |
2571 | unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); | 2243 | unsigned front_len = le32_to_cpu(con->in_hdr.front_len); |
2572 | unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); | 2244 | unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); |
2573 | unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); | 2245 | unsigned data_len = le32_to_cpu(con->in_hdr.data_len); |
2574 | 2246 | ||
2575 | /* skip rest of message */ | 2247 | /* skip rest of message */ |
2576 | dout("%s %p msg %p revoked\n", __func__, con, msg); | 2248 | dout("con_revoke_pages %p msg %p revoked\n", con, msg); |
2577 | con->in_base_pos = con->in_base_pos - | 2249 | con->in_base_pos = con->in_base_pos - |
2578 | sizeof(struct ceph_msg_header) - | 2250 | sizeof(struct ceph_msg_header) - |
2579 | front_len - | 2251 | front_len - |
2580 | middle_len - | 2252 | middle_len - |
@@ -2585,8 +2257,8 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) | |||
2585 | con->in_tag = CEPH_MSGR_TAG_READY; | 2257 | con->in_tag = CEPH_MSGR_TAG_READY; |
2586 | con->in_seq++; | 2258 | con->in_seq++; |
2587 | } else { | 2259 | } else { |
2588 | dout("%s %p in_msg %p msg %p no-op\n", | 2260 | dout("con_revoke_pages %p msg %p pages %p no-op\n", |
2589 | __func__, con, con->in_msg, msg); | 2261 | con, con->in_msg, msg); |
2590 | } | 2262 | } |
2591 | mutex_unlock(&con->mutex); | 2263 | mutex_unlock(&con->mutex); |
2592 | } | 2264 | } |
@@ -2597,11 +2269,9 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) | |||
2597 | void ceph_con_keepalive(struct ceph_connection *con) | 2269 | void ceph_con_keepalive(struct ceph_connection *con) |
2598 | { | 2270 | { |
2599 | dout("con_keepalive %p\n", con); | 2271 | dout("con_keepalive %p\n", con); |
2600 | mutex_lock(&con->mutex); | ||
2601 | clear_standby(con); | 2272 | clear_standby(con); |
2602 | mutex_unlock(&con->mutex); | 2273 | if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && |
2603 | if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && | 2274 | test_and_set_bit(WRITE_PENDING, &con->state) == 0) |
2604 | test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) | ||
2605 | queue_con(con); | 2275 | queue_con(con); |
2606 | } | 2276 | } |
2607 | EXPORT_SYMBOL(ceph_con_keepalive); | 2277 | EXPORT_SYMBOL(ceph_con_keepalive); |
@@ -2611,8 +2281,7 @@ EXPORT_SYMBOL(ceph_con_keepalive); | |||
2611 | * construct a new message with given type, size | 2281 | * construct a new message with given type, size |
2612 | * the new msg has a ref count of 1. | 2282 | * the new msg has a ref count of 1. |
2613 | */ | 2283 | */ |
2614 | struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | 2284 | struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) |
2615 | bool can_fail) | ||
2616 | { | 2285 | { |
2617 | struct ceph_msg *m; | 2286 | struct ceph_msg *m; |
2618 | 2287 | ||
@@ -2620,8 +2289,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |||
2620 | if (m == NULL) | 2289 | if (m == NULL) |
2621 | goto out; | 2290 | goto out; |
2622 | kref_init(&m->kref); | 2291 | kref_init(&m->kref); |
2623 | |||
2624 | m->con = NULL; | ||
2625 | INIT_LIST_HEAD(&m->list_head); | 2292 | INIT_LIST_HEAD(&m->list_head); |
2626 | 2293 | ||
2627 | m->hdr.tid = 0; | 2294 | m->hdr.tid = 0; |
@@ -2666,7 +2333,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |||
2666 | m->front.iov_base = kmalloc(front_len, flags); | 2333 | m->front.iov_base = kmalloc(front_len, flags); |
2667 | } | 2334 | } |
2668 | if (m->front.iov_base == NULL) { | 2335 | if (m->front.iov_base == NULL) { |
2669 | dout("ceph_msg_new can't allocate %d bytes\n", | 2336 | pr_err("msg_new can't allocate %d bytes\n", |
2670 | front_len); | 2337 | front_len); |
2671 | goto out2; | 2338 | goto out2; |
2672 | } | 2339 | } |
@@ -2681,14 +2348,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |||
2681 | out2: | 2348 | out2: |
2682 | ceph_msg_put(m); | 2349 | ceph_msg_put(m); |
2683 | out: | 2350 | out: |
2684 | if (!can_fail) { | 2351 | pr_err("msg_new can't create type %d front %d\n", type, front_len); |
2685 | pr_err("msg_new can't create type %d front %d\n", type, | ||
2686 | front_len); | ||
2687 | WARN_ON(1); | ||
2688 | } else { | ||
2689 | dout("msg_new can't create type %d front %d\n", type, | ||
2690 | front_len); | ||
2691 | } | ||
2692 | return NULL; | 2352 | return NULL; |
2693 | } | 2353 | } |
2694 | EXPORT_SYMBOL(ceph_msg_new); | 2354 | EXPORT_SYMBOL(ceph_msg_new); |
@@ -2717,78 +2377,46 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) | |||
2717 | } | 2377 | } |
2718 | 2378 | ||
2719 | /* | 2379 | /* |
2720 | * Allocate a message for receiving an incoming message on a | 2380 | * Generic message allocator, for incoming messages. |
2721 | * connection, and save the result in con->in_msg. Uses the | ||
2722 | * connection's private alloc_msg op if available. | ||
2723 | * | ||
2724 | * Returns 0 on success, or a negative error code. | ||
2725 | * | ||
2726 | * On success, if we set *skip = 1: | ||
2727 | * - the next message should be skipped and ignored. | ||
2728 | * - con->in_msg == NULL | ||
2729 | * or if we set *skip = 0: | ||
2730 | * - con->in_msg is non-null. | ||
2731 | * On error (ENOMEM, EAGAIN, ...), | ||
2732 | * - con->in_msg == NULL | ||
2733 | */ | 2381 | */ |
2734 | static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) | 2382 | static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, |
2383 | struct ceph_msg_header *hdr, | ||
2384 | int *skip) | ||
2735 | { | 2385 | { |
2736 | struct ceph_msg_header *hdr = &con->in_hdr; | ||
2737 | int type = le16_to_cpu(hdr->type); | 2386 | int type = le16_to_cpu(hdr->type); |
2738 | int front_len = le32_to_cpu(hdr->front_len); | 2387 | int front_len = le32_to_cpu(hdr->front_len); |
2739 | int middle_len = le32_to_cpu(hdr->middle_len); | 2388 | int middle_len = le32_to_cpu(hdr->middle_len); |
2740 | int ret = 0; | 2389 | struct ceph_msg *msg = NULL; |
2741 | 2390 | int ret; | |
2742 | BUG_ON(con->in_msg != NULL); | ||
2743 | 2391 | ||
2744 | if (con->ops->alloc_msg) { | 2392 | if (con->ops->alloc_msg) { |
2745 | struct ceph_msg *msg; | ||
2746 | |||
2747 | mutex_unlock(&con->mutex); | 2393 | mutex_unlock(&con->mutex); |
2748 | msg = con->ops->alloc_msg(con, hdr, skip); | 2394 | msg = con->ops->alloc_msg(con, hdr, skip); |
2749 | mutex_lock(&con->mutex); | 2395 | mutex_lock(&con->mutex); |
2750 | if (con->state != CON_STATE_OPEN) { | 2396 | if (!msg || *skip) |
2751 | if (msg) | 2397 | return NULL; |
2752 | ceph_msg_put(msg); | ||
2753 | return -EAGAIN; | ||
2754 | } | ||
2755 | con->in_msg = msg; | ||
2756 | if (con->in_msg) { | ||
2757 | con->in_msg->con = con->ops->get(con); | ||
2758 | BUG_ON(con->in_msg->con == NULL); | ||
2759 | } | ||
2760 | if (*skip) { | ||
2761 | con->in_msg = NULL; | ||
2762 | return 0; | ||
2763 | } | ||
2764 | if (!con->in_msg) { | ||
2765 | con->error_msg = | ||
2766 | "error allocating memory for incoming message"; | ||
2767 | return -ENOMEM; | ||
2768 | } | ||
2769 | } | 2398 | } |
2770 | if (!con->in_msg) { | 2399 | if (!msg) { |
2771 | con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); | 2400 | *skip = 0; |
2772 | if (!con->in_msg) { | 2401 | msg = ceph_msg_new(type, front_len, GFP_NOFS); |
2402 | if (!msg) { | ||
2773 | pr_err("unable to allocate msg type %d len %d\n", | 2403 | pr_err("unable to allocate msg type %d len %d\n", |
2774 | type, front_len); | 2404 | type, front_len); |
2775 | return -ENOMEM; | 2405 | return NULL; |
2776 | } | 2406 | } |
2777 | con->in_msg->con = con->ops->get(con); | 2407 | msg->page_alignment = le16_to_cpu(hdr->data_off); |
2778 | BUG_ON(con->in_msg->con == NULL); | ||
2779 | con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); | ||
2780 | } | 2408 | } |
2781 | memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); | 2409 | memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); |
2782 | 2410 | ||
2783 | if (middle_len && !con->in_msg->middle) { | 2411 | if (middle_len && !msg->middle) { |
2784 | ret = ceph_alloc_middle(con, con->in_msg); | 2412 | ret = ceph_alloc_middle(con, msg); |
2785 | if (ret < 0) { | 2413 | if (ret < 0) { |
2786 | ceph_msg_put(con->in_msg); | 2414 | ceph_msg_put(msg); |
2787 | con->in_msg = NULL; | 2415 | return NULL; |
2788 | } | 2416 | } |
2789 | } | 2417 | } |
2790 | 2418 | ||
2791 | return ret; | 2419 | return msg; |
2792 | } | 2420 | } |
2793 | 2421 | ||
2794 | 2422 | ||
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 812eb3b46c1..cbe31fa4550 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c | |||
@@ -8,8 +8,8 @@ | |||
8 | 8 | ||
9 | #include <linux/ceph/mon_client.h> | 9 | #include <linux/ceph/mon_client.h> |
10 | #include <linux/ceph/libceph.h> | 10 | #include <linux/ceph/libceph.h> |
11 | #include <linux/ceph/debugfs.h> | ||
12 | #include <linux/ceph/decode.h> | 11 | #include <linux/ceph/decode.h> |
12 | |||
13 | #include <linux/ceph/auth.h> | 13 | #include <linux/ceph/auth.h> |
14 | 14 | ||
15 | /* | 15 | /* |
@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) | |||
106 | monc->pending_auth = 1; | 106 | monc->pending_auth = 1; |
107 | monc->m_auth->front.iov_len = len; | 107 | monc->m_auth->front.iov_len = len; |
108 | monc->m_auth->hdr.front_len = cpu_to_le32(len); | 108 | monc->m_auth->hdr.front_len = cpu_to_le32(len); |
109 | ceph_msg_revoke(monc->m_auth); | 109 | ceph_con_revoke(monc->con, monc->m_auth); |
110 | ceph_msg_get(monc->m_auth); /* keep our ref */ | 110 | ceph_msg_get(monc->m_auth); /* keep our ref */ |
111 | ceph_con_send(&monc->con, monc->m_auth); | 111 | ceph_con_send(monc->con, monc->m_auth); |
112 | } | 112 | } |
113 | 113 | ||
114 | /* | 114 | /* |
@@ -116,15 +116,14 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) | |||
116 | */ | 116 | */ |
117 | static void __close_session(struct ceph_mon_client *monc) | 117 | static void __close_session(struct ceph_mon_client *monc) |
118 | { | 118 | { |
119 | dout("__close_session closing mon%d\n", monc->cur_mon); | 119 | if (monc->con) { |
120 | ceph_msg_revoke(monc->m_auth); | 120 | dout("__close_session closing mon%d\n", monc->cur_mon); |
121 | ceph_msg_revoke_incoming(monc->m_auth_reply); | 121 | ceph_con_revoke(monc->con, monc->m_auth); |
122 | ceph_msg_revoke(monc->m_subscribe); | 122 | ceph_con_close(monc->con); |
123 | ceph_msg_revoke_incoming(monc->m_subscribe_ack); | 123 | monc->cur_mon = -1; |
124 | ceph_con_close(&monc->con); | 124 | monc->pending_auth = 0; |
125 | monc->cur_mon = -1; | 125 | ceph_auth_reset(monc->auth); |
126 | monc->pending_auth = 0; | 126 | } |
127 | ceph_auth_reset(monc->auth); | ||
128 | } | 127 | } |
129 | 128 | ||
130 | /* | 129 | /* |
@@ -145,8 +144,9 @@ static int __open_session(struct ceph_mon_client *monc) | |||
145 | monc->want_next_osdmap = !!monc->want_next_osdmap; | 144 | monc->want_next_osdmap = !!monc->want_next_osdmap; |
146 | 145 | ||
147 | dout("open_session mon%d opening\n", monc->cur_mon); | 146 | dout("open_session mon%d opening\n", monc->cur_mon); |
148 | ceph_con_open(&monc->con, | 147 | monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; |
149 | CEPH_ENTITY_TYPE_MON, monc->cur_mon, | 148 | monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); |
149 | ceph_con_open(monc->con, | ||
150 | &monc->monmap->mon_inst[monc->cur_mon].addr); | 150 | &monc->monmap->mon_inst[monc->cur_mon].addr); |
151 | 151 | ||
152 | /* initiatiate authentication handshake */ | 152 | /* initiatiate authentication handshake */ |
@@ -170,7 +170,7 @@ static bool __sub_expired(struct ceph_mon_client *monc) | |||
170 | */ | 170 | */ |
171 | static void __schedule_delayed(struct ceph_mon_client *monc) | 171 | static void __schedule_delayed(struct ceph_mon_client *monc) |
172 | { | 172 | { |
173 | unsigned int delay; | 173 | unsigned delay; |
174 | 174 | ||
175 | if (monc->cur_mon < 0 || __sub_expired(monc)) | 175 | if (monc->cur_mon < 0 || __sub_expired(monc)) |
176 | delay = 10 * HZ; | 176 | delay = 10 * HZ; |
@@ -186,7 +186,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc) | |||
186 | static void __send_subscribe(struct ceph_mon_client *monc) | 186 | static void __send_subscribe(struct ceph_mon_client *monc) |
187 | { | 187 | { |
188 | dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", | 188 | dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", |
189 | (unsigned int)monc->sub_sent, __sub_expired(monc), | 189 | (unsigned)monc->sub_sent, __sub_expired(monc), |
190 | monc->want_next_osdmap); | 190 | monc->want_next_osdmap); |
191 | if ((__sub_expired(monc) && !monc->sub_sent) || | 191 | if ((__sub_expired(monc) && !monc->sub_sent) || |
192 | monc->want_next_osdmap == 1) { | 192 | monc->want_next_osdmap == 1) { |
@@ -203,7 +203,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) | |||
203 | 203 | ||
204 | if (monc->want_next_osdmap) { | 204 | if (monc->want_next_osdmap) { |
205 | dout("__send_subscribe to 'osdmap' %u\n", | 205 | dout("__send_subscribe to 'osdmap' %u\n", |
206 | (unsigned int)monc->have_osdmap); | 206 | (unsigned)monc->have_osdmap); |
207 | ceph_encode_string(&p, end, "osdmap", 6); | 207 | ceph_encode_string(&p, end, "osdmap", 6); |
208 | i = p; | 208 | i = p; |
209 | i->have = cpu_to_le64(monc->have_osdmap); | 209 | i->have = cpu_to_le64(monc->have_osdmap); |
@@ -213,7 +213,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) | |||
213 | } | 213 | } |
214 | if (monc->want_mdsmap) { | 214 | if (monc->want_mdsmap) { |
215 | dout("__send_subscribe to 'mdsmap' %u+\n", | 215 | dout("__send_subscribe to 'mdsmap' %u+\n", |
216 | (unsigned int)monc->have_mdsmap); | 216 | (unsigned)monc->have_mdsmap); |
217 | ceph_encode_string(&p, end, "mdsmap", 6); | 217 | ceph_encode_string(&p, end, "mdsmap", 6); |
218 | i = p; | 218 | i = p; |
219 | i->have = cpu_to_le64(monc->have_mdsmap); | 219 | i->have = cpu_to_le64(monc->have_mdsmap); |
@@ -228,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) | |||
228 | 228 | ||
229 | msg->front.iov_len = p - msg->front.iov_base; | 229 | msg->front.iov_len = p - msg->front.iov_base; |
230 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); | 230 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); |
231 | ceph_msg_revoke(msg); | 231 | ceph_con_revoke(monc->con, msg); |
232 | ceph_con_send(&monc->con, ceph_msg_get(msg)); | 232 | ceph_con_send(monc->con, ceph_msg_get(msg)); |
233 | 233 | ||
234 | monc->sub_sent = jiffies | 1; /* never 0 */ | 234 | monc->sub_sent = jiffies | 1; /* never 0 */ |
235 | } | 235 | } |
@@ -238,7 +238,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) | |||
238 | static void handle_subscribe_ack(struct ceph_mon_client *monc, | 238 | static void handle_subscribe_ack(struct ceph_mon_client *monc, |
239 | struct ceph_msg *msg) | 239 | struct ceph_msg *msg) |
240 | { | 240 | { |
241 | unsigned int seconds; | 241 | unsigned seconds; |
242 | struct ceph_mon_subscribe_ack *h = msg->front.iov_base; | 242 | struct ceph_mon_subscribe_ack *h = msg->front.iov_base; |
243 | 243 | ||
244 | if (msg->front.iov_len < sizeof(*h)) | 244 | if (msg->front.iov_len < sizeof(*h)) |
@@ -249,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, | |||
249 | if (monc->hunting) { | 249 | if (monc->hunting) { |
250 | pr_info("mon%d %s session established\n", | 250 | pr_info("mon%d %s session established\n", |
251 | monc->cur_mon, | 251 | monc->cur_mon, |
252 | ceph_pr_addr(&monc->con.peer_addr.in_addr)); | 252 | ceph_pr_addr(&monc->con->peer_addr.in_addr)); |
253 | monc->hunting = false; | 253 | monc->hunting = false; |
254 | } | 254 | } |
255 | dout("handle_subscribe_ack after %d seconds\n", seconds); | 255 | dout("handle_subscribe_ack after %d seconds\n", seconds); |
@@ -302,6 +302,15 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) | |||
302 | */ | 302 | */ |
303 | int ceph_monc_open_session(struct ceph_mon_client *monc) | 303 | int ceph_monc_open_session(struct ceph_mon_client *monc) |
304 | { | 304 | { |
305 | if (!monc->con) { | ||
306 | monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); | ||
307 | if (!monc->con) | ||
308 | return -ENOMEM; | ||
309 | ceph_con_init(monc->client->msgr, monc->con); | ||
310 | monc->con->private = monc; | ||
311 | monc->con->ops = &mon_con_ops; | ||
312 | } | ||
313 | |||
305 | mutex_lock(&monc->mutex); | 314 | mutex_lock(&monc->mutex); |
306 | __open_session(monc); | 315 | __open_session(monc); |
307 | __schedule_delayed(monc); | 316 | __schedule_delayed(monc); |
@@ -311,17 +320,6 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) | |||
311 | EXPORT_SYMBOL(ceph_monc_open_session); | 320 | EXPORT_SYMBOL(ceph_monc_open_session); |
312 | 321 | ||
313 | /* | 322 | /* |
314 | * We require the fsid and global_id in order to initialize our | ||
315 | * debugfs dir. | ||
316 | */ | ||
317 | static bool have_debugfs_info(struct ceph_mon_client *monc) | ||
318 | { | ||
319 | dout("have_debugfs_info fsid %d globalid %lld\n", | ||
320 | (int)monc->client->have_fsid, monc->auth->global_id); | ||
321 | return monc->client->have_fsid && monc->auth->global_id > 0; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * The monitor responds with mount ack indicate mount success. The | 323 | * The monitor responds with mount ack indicate mount success. The |
326 | * included client ticket allows the client to talk to MDSs and OSDs. | 324 | * included client ticket allows the client to talk to MDSs and OSDs. |
327 | */ | 325 | */ |
@@ -331,12 +329,9 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, | |||
331 | struct ceph_client *client = monc->client; | 329 | struct ceph_client *client = monc->client; |
332 | struct ceph_monmap *monmap = NULL, *old = monc->monmap; | 330 | struct ceph_monmap *monmap = NULL, *old = monc->monmap; |
333 | void *p, *end; | 331 | void *p, *end; |
334 | int had_debugfs_info, init_debugfs = 0; | ||
335 | 332 | ||
336 | mutex_lock(&monc->mutex); | 333 | mutex_lock(&monc->mutex); |
337 | 334 | ||
338 | had_debugfs_info = have_debugfs_info(monc); | ||
339 | |||
340 | dout("handle_monmap\n"); | 335 | dout("handle_monmap\n"); |
341 | p = msg->front.iov_base; | 336 | p = msg->front.iov_base; |
342 | end = p + msg->front.iov_len; | 337 | end = p + msg->front.iov_len; |
@@ -356,29 +351,8 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, | |||
356 | client->monc.monmap = monmap; | 351 | client->monc.monmap = monmap; |
357 | kfree(old); | 352 | kfree(old); |
358 | 353 | ||
359 | if (!client->have_fsid) { | ||
360 | client->have_fsid = true; | ||
361 | if (!had_debugfs_info && have_debugfs_info(monc)) { | ||
362 | pr_info("client%lld fsid %pU\n", | ||
363 | ceph_client_id(monc->client), | ||
364 | &monc->client->fsid); | ||
365 | init_debugfs = 1; | ||
366 | } | ||
367 | mutex_unlock(&monc->mutex); | ||
368 | |||
369 | if (init_debugfs) { | ||
370 | /* | ||
371 | * do debugfs initialization without mutex to avoid | ||
372 | * creating a locking dependency | ||
373 | */ | ||
374 | ceph_debugfs_client_init(monc->client); | ||
375 | } | ||
376 | |||
377 | goto out_unlocked; | ||
378 | } | ||
379 | out: | 354 | out: |
380 | mutex_unlock(&monc->mutex); | 355 | mutex_unlock(&monc->mutex); |
381 | out_unlocked: | ||
382 | wake_up_all(&client->auth_wq); | 356 | wake_up_all(&client->auth_wq); |
383 | } | 357 | } |
384 | 358 | ||
@@ -465,7 +439,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, | |||
465 | m = NULL; | 439 | m = NULL; |
466 | } else { | 440 | } else { |
467 | dout("get_generic_reply %lld got %p\n", tid, req->reply); | 441 | dout("get_generic_reply %lld got %p\n", tid, req->reply); |
468 | *skip = 0; | ||
469 | m = ceph_msg_get(req->reply); | 442 | m = ceph_msg_get(req->reply); |
470 | /* | 443 | /* |
471 | * we don't need to track the connection reading into | 444 | * we don't need to track the connection reading into |
@@ -488,7 +461,7 @@ static int do_generic_request(struct ceph_mon_client *monc, | |||
488 | req->request->hdr.tid = cpu_to_le64(req->tid); | 461 | req->request->hdr.tid = cpu_to_le64(req->tid); |
489 | __insert_generic_request(monc, req); | 462 | __insert_generic_request(monc, req); |
490 | monc->num_generic_requests++; | 463 | monc->num_generic_requests++; |
491 | ceph_con_send(&monc->con, ceph_msg_get(req->request)); | 464 | ceph_con_send(monc->con, ceph_msg_get(req->request)); |
492 | mutex_unlock(&monc->mutex); | 465 | mutex_unlock(&monc->mutex); |
493 | 466 | ||
494 | err = wait_for_completion_interruptible(&req->completion); | 467 | err = wait_for_completion_interruptible(&req->completion); |
@@ -555,12 +528,10 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) | |||
555 | init_completion(&req->completion); | 528 | init_completion(&req->completion); |
556 | 529 | ||
557 | err = -ENOMEM; | 530 | err = -ENOMEM; |
558 | req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, | 531 | req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); |
559 | true); | ||
560 | if (!req->request) | 532 | if (!req->request) |
561 | goto out; | 533 | goto out; |
562 | req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, | 534 | req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); |
563 | true); | ||
564 | if (!req->reply) | 535 | if (!req->reply) |
565 | goto out; | 536 | goto out; |
566 | 537 | ||
@@ -637,7 +608,7 @@ bad: | |||
637 | /* | 608 | /* |
638 | * Do a synchronous pool op. | 609 | * Do a synchronous pool op. |
639 | */ | 610 | */ |
640 | static int do_poolop(struct ceph_mon_client *monc, u32 op, | 611 | int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, |
641 | u32 pool, u64 snapid, | 612 | u32 pool, u64 snapid, |
642 | char *buf, int len) | 613 | char *buf, int len) |
643 | { | 614 | { |
@@ -655,12 +626,10 @@ static int do_poolop(struct ceph_mon_client *monc, u32 op, | |||
655 | init_completion(&req->completion); | 626 | init_completion(&req->completion); |
656 | 627 | ||
657 | err = -ENOMEM; | 628 | err = -ENOMEM; |
658 | req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, | 629 | req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS); |
659 | true); | ||
660 | if (!req->request) | 630 | if (!req->request) |
661 | goto out; | 631 | goto out; |
662 | req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, | 632 | req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS); |
663 | true); | ||
664 | if (!req->reply) | 633 | if (!req->reply) |
665 | goto out; | 634 | goto out; |
666 | 635 | ||
@@ -687,7 +656,7 @@ out: | |||
687 | int ceph_monc_create_snapid(struct ceph_mon_client *monc, | 656 | int ceph_monc_create_snapid(struct ceph_mon_client *monc, |
688 | u32 pool, u64 *snapid) | 657 | u32 pool, u64 *snapid) |
689 | { | 658 | { |
690 | return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | 659 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, |
691 | pool, 0, (char *)snapid, sizeof(*snapid)); | 660 | pool, 0, (char *)snapid, sizeof(*snapid)); |
692 | 661 | ||
693 | } | 662 | } |
@@ -696,7 +665,7 @@ EXPORT_SYMBOL(ceph_monc_create_snapid); | |||
696 | int ceph_monc_delete_snapid(struct ceph_mon_client *monc, | 665 | int ceph_monc_delete_snapid(struct ceph_mon_client *monc, |
697 | u32 pool, u64 snapid) | 666 | u32 pool, u64 snapid) |
698 | { | 667 | { |
699 | return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | 668 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, |
700 | pool, snapid, 0, 0); | 669 | pool, snapid, 0, 0); |
701 | 670 | ||
702 | } | 671 | } |
@@ -711,9 +680,8 @@ static void __resend_generic_request(struct ceph_mon_client *monc) | |||
711 | 680 | ||
712 | for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { | 681 | for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { |
713 | req = rb_entry(p, struct ceph_mon_generic_request, node); | 682 | req = rb_entry(p, struct ceph_mon_generic_request, node); |
714 | ceph_msg_revoke(req->request); | 683 | ceph_con_revoke(monc->con, req->request); |
715 | ceph_msg_revoke_incoming(req->reply); | 684 | ceph_con_send(monc->con, ceph_msg_get(req->request)); |
716 | ceph_con_send(&monc->con, ceph_msg_get(req->request)); | ||
717 | } | 685 | } |
718 | } | 686 | } |
719 | 687 | ||
@@ -733,7 +701,7 @@ static void delayed_work(struct work_struct *work) | |||
733 | __close_session(monc); | 701 | __close_session(monc); |
734 | __open_session(monc); /* continue hunting */ | 702 | __open_session(monc); /* continue hunting */ |
735 | } else { | 703 | } else { |
736 | ceph_con_keepalive(&monc->con); | 704 | ceph_con_keepalive(monc->con); |
737 | 705 | ||
738 | __validate_auth(monc); | 706 | __validate_auth(monc); |
739 | 707 | ||
@@ -769,6 +737,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc) | |||
769 | monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); | 737 | monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); |
770 | } | 738 | } |
771 | monc->monmap->num_mon = num_mon; | 739 | monc->monmap->num_mon = num_mon; |
740 | monc->have_fsid = false; | ||
772 | return 0; | 741 | return 0; |
773 | } | 742 | } |
774 | 743 | ||
@@ -786,14 +755,13 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) | |||
786 | if (err) | 755 | if (err) |
787 | goto out; | 756 | goto out; |
788 | 757 | ||
789 | /* connection */ | 758 | monc->con = NULL; |
759 | |||
790 | /* authentication */ | 760 | /* authentication */ |
791 | monc->auth = ceph_auth_init(cl->options->name, | 761 | monc->auth = ceph_auth_init(cl->options->name, |
792 | cl->options->key); | 762 | cl->options->key); |
793 | if (IS_ERR(monc->auth)) { | 763 | if (IS_ERR(monc->auth)) |
794 | err = PTR_ERR(monc->auth); | 764 | return PTR_ERR(monc->auth); |
795 | goto out_monmap; | ||
796 | } | ||
797 | monc->auth->want_keys = | 765 | monc->auth->want_keys = |
798 | CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | | 766 | CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | |
799 | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; | 767 | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; |
@@ -802,28 +770,23 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) | |||
802 | err = -ENOMEM; | 770 | err = -ENOMEM; |
803 | monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, | 771 | monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, |
804 | sizeof(struct ceph_mon_subscribe_ack), | 772 | sizeof(struct ceph_mon_subscribe_ack), |
805 | GFP_NOFS, true); | 773 | GFP_NOFS); |
806 | if (!monc->m_subscribe_ack) | 774 | if (!monc->m_subscribe_ack) |
807 | goto out_auth; | 775 | goto out_monmap; |
808 | 776 | ||
809 | monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, | 777 | monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); |
810 | true); | ||
811 | if (!monc->m_subscribe) | 778 | if (!monc->m_subscribe) |
812 | goto out_subscribe_ack; | 779 | goto out_subscribe_ack; |
813 | 780 | ||
814 | monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, | 781 | monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); |
815 | true); | ||
816 | if (!monc->m_auth_reply) | 782 | if (!monc->m_auth_reply) |
817 | goto out_subscribe; | 783 | goto out_subscribe; |
818 | 784 | ||
819 | monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); | 785 | monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); |
820 | monc->pending_auth = 0; | 786 | monc->pending_auth = 0; |
821 | if (!monc->m_auth) | 787 | if (!monc->m_auth) |
822 | goto out_auth_reply; | 788 | goto out_auth_reply; |
823 | 789 | ||
824 | ceph_con_init(&monc->con, monc, &mon_con_ops, | ||
825 | &monc->client->msgr); | ||
826 | |||
827 | monc->cur_mon = -1; | 790 | monc->cur_mon = -1; |
828 | monc->hunting = true; | 791 | monc->hunting = true; |
829 | monc->sub_renew_after = jiffies; | 792 | monc->sub_renew_after = jiffies; |
@@ -845,8 +808,6 @@ out_subscribe: | |||
845 | ceph_msg_put(monc->m_subscribe); | 808 | ceph_msg_put(monc->m_subscribe); |
846 | out_subscribe_ack: | 809 | out_subscribe_ack: |
847 | ceph_msg_put(monc->m_subscribe_ack); | 810 | ceph_msg_put(monc->m_subscribe_ack); |
848 | out_auth: | ||
849 | ceph_auth_destroy(monc->auth); | ||
850 | out_monmap: | 811 | out_monmap: |
851 | kfree(monc->monmap); | 812 | kfree(monc->monmap); |
852 | out: | 813 | out: |
@@ -861,17 +822,13 @@ void ceph_monc_stop(struct ceph_mon_client *monc) | |||
861 | 822 | ||
862 | mutex_lock(&monc->mutex); | 823 | mutex_lock(&monc->mutex); |
863 | __close_session(monc); | 824 | __close_session(monc); |
864 | 825 | if (monc->con) { | |
826 | monc->con->private = NULL; | ||
827 | monc->con->ops->put(monc->con); | ||
828 | monc->con = NULL; | ||
829 | } | ||
865 | mutex_unlock(&monc->mutex); | 830 | mutex_unlock(&monc->mutex); |
866 | 831 | ||
867 | /* | ||
868 | * flush msgr queue before we destroy ourselves to ensure that: | ||
869 | * - any work that references our embedded con is finished. | ||
870 | * - any osd_client or other work that may reference an authorizer | ||
871 | * finishes before we shut down the auth subsystem. | ||
872 | */ | ||
873 | ceph_msgr_flush(); | ||
874 | |||
875 | ceph_auth_destroy(monc->auth); | 832 | ceph_auth_destroy(monc->auth); |
876 | 833 | ||
877 | ceph_msg_put(monc->m_auth); | 834 | ceph_msg_put(monc->m_auth); |
@@ -888,10 +845,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc, | |||
888 | { | 845 | { |
889 | int ret; | 846 | int ret; |
890 | int was_auth = 0; | 847 | int was_auth = 0; |
891 | int had_debugfs_info, init_debugfs = 0; | ||
892 | 848 | ||
893 | mutex_lock(&monc->mutex); | 849 | mutex_lock(&monc->mutex); |
894 | had_debugfs_info = have_debugfs_info(monc); | ||
895 | if (monc->auth->ops) | 850 | if (monc->auth->ops) |
896 | was_auth = monc->auth->ops->is_authenticated(monc->auth); | 851 | was_auth = monc->auth->ops->is_authenticated(monc->auth); |
897 | monc->pending_auth = 0; | 852 | monc->pending_auth = 0; |
@@ -907,29 +862,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc, | |||
907 | } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { | 862 | } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { |
908 | dout("authenticated, starting session\n"); | 863 | dout("authenticated, starting session\n"); |
909 | 864 | ||
910 | monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; | 865 | monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; |
911 | monc->client->msgr.inst.name.num = | 866 | monc->client->msgr->inst.name.num = |
912 | cpu_to_le64(monc->auth->global_id); | 867 | cpu_to_le64(monc->auth->global_id); |
913 | 868 | ||
914 | __send_subscribe(monc); | 869 | __send_subscribe(monc); |
915 | __resend_generic_request(monc); | 870 | __resend_generic_request(monc); |
916 | } | 871 | } |
917 | |||
918 | if (!had_debugfs_info && have_debugfs_info(monc)) { | ||
919 | pr_info("client%lld fsid %pU\n", | ||
920 | ceph_client_id(monc->client), | ||
921 | &monc->client->fsid); | ||
922 | init_debugfs = 1; | ||
923 | } | ||
924 | mutex_unlock(&monc->mutex); | 872 | mutex_unlock(&monc->mutex); |
925 | |||
926 | if (init_debugfs) { | ||
927 | /* | ||
928 | * do debugfs initialization without mutex to avoid | ||
929 | * creating a locking dependency | ||
930 | */ | ||
931 | ceph_debugfs_client_init(monc->client); | ||
932 | } | ||
933 | } | 873 | } |
934 | 874 | ||
935 | static int __validate_auth(struct ceph_mon_client *monc) | 875 | static int __validate_auth(struct ceph_mon_client *monc) |
@@ -1033,9 +973,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, | |||
1033 | case CEPH_MSG_MON_MAP: | 973 | case CEPH_MSG_MON_MAP: |
1034 | case CEPH_MSG_MDS_MAP: | 974 | case CEPH_MSG_MDS_MAP: |
1035 | case CEPH_MSG_OSD_MAP: | 975 | case CEPH_MSG_OSD_MAP: |
1036 | m = ceph_msg_new(type, front_len, GFP_NOFS, false); | 976 | m = ceph_msg_new(type, front_len, GFP_NOFS); |
1037 | if (!m) | ||
1038 | return NULL; /* ENOMEM--return skip == 0 */ | ||
1039 | break; | 977 | break; |
1040 | } | 978 | } |
1041 | 979 | ||
@@ -1062,10 +1000,10 @@ static void mon_fault(struct ceph_connection *con) | |||
1062 | if (!con->private) | 1000 | if (!con->private) |
1063 | goto out; | 1001 | goto out; |
1064 | 1002 | ||
1065 | if (!monc->hunting) | 1003 | if (monc->con && !monc->hunting) |
1066 | pr_info("mon%d %s session lost, " | 1004 | pr_info("mon%d %s session lost, " |
1067 | "hunting for new mon\n", monc->cur_mon, | 1005 | "hunting for new mon\n", monc->cur_mon, |
1068 | ceph_pr_addr(&monc->con.peer_addr.in_addr)); | 1006 | ceph_pr_addr(&monc->con->peer_addr.in_addr)); |
1069 | 1007 | ||
1070 | __close_session(monc); | 1008 | __close_session(monc); |
1071 | if (!monc->hunting) { | 1009 | if (!monc->hunting) { |
@@ -1080,23 +1018,9 @@ out: | |||
1080 | mutex_unlock(&monc->mutex); | 1018 | mutex_unlock(&monc->mutex); |
1081 | } | 1019 | } |
1082 | 1020 | ||
1083 | /* | ||
1084 | * We can ignore refcounting on the connection struct, as all references | ||
1085 | * will come from the messenger workqueue, which is drained prior to | ||
1086 | * mon_client destruction. | ||
1087 | */ | ||
1088 | static struct ceph_connection *con_get(struct ceph_connection *con) | ||
1089 | { | ||
1090 | return con; | ||
1091 | } | ||
1092 | |||
1093 | static void con_put(struct ceph_connection *con) | ||
1094 | { | ||
1095 | } | ||
1096 | |||
1097 | static const struct ceph_connection_operations mon_con_ops = { | 1021 | static const struct ceph_connection_operations mon_con_ops = { |
1098 | .get = con_get, | 1022 | .get = ceph_con_get, |
1099 | .put = con_put, | 1023 | .put = ceph_con_put, |
1100 | .dispatch = dispatch, | 1024 | .dispatch = dispatch, |
1101 | .fault = mon_fault, | 1025 | .fault = mon_fault, |
1102 | .alloc_msg = mon_alloc_msg, | 1026 | .alloc_msg = mon_alloc_msg, |
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index ddec1c10ac8..1f4cb30a42c 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c | |||
@@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) | |||
12 | struct ceph_msgpool *pool = arg; | 12 | struct ceph_msgpool *pool = arg; |
13 | struct ceph_msg *msg; | 13 | struct ceph_msg *msg; |
14 | 14 | ||
15 | msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); | 15 | msg = ceph_msg_new(0, pool->front_len, gfp_mask); |
16 | if (!msg) { | 16 | if (!msg) { |
17 | dout("msgpool_alloc %s failed\n", pool->name); | 17 | dout("msgpool_alloc %s failed\n", pool->name); |
18 | } else { | 18 | } else { |
@@ -32,11 +32,10 @@ static void msgpool_free(void *element, void *arg) | |||
32 | ceph_msg_put(msg); | 32 | ceph_msg_put(msg); |
33 | } | 33 | } |
34 | 34 | ||
35 | int ceph_msgpool_init(struct ceph_msgpool *pool, int type, | 35 | int ceph_msgpool_init(struct ceph_msgpool *pool, |
36 | int front_len, int size, bool blocking, const char *name) | 36 | int front_len, int size, bool blocking, const char *name) |
37 | { | 37 | { |
38 | dout("msgpool %s init\n", name); | 38 | dout("msgpool %s init\n", name); |
39 | pool->type = type; | ||
40 | pool->front_len = front_len; | 39 | pool->front_len = front_len; |
41 | pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); | 40 | pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); |
42 | if (!pool->pool) | 41 | if (!pool->pool) |
@@ -62,7 +61,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, | |||
62 | WARN_ON(1); | 61 | WARN_ON(1); |
63 | 62 | ||
64 | /* try to alloc a fresh message */ | 63 | /* try to alloc a fresh message */ |
65 | return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); | 64 | return ceph_msg_new(0, front_len, GFP_NOFS); |
66 | } | 65 | } |
67 | 66 | ||
68 | msg = mempool_alloc(pool->pool, GFP_NOFS); | 67 | msg = mempool_alloc(pool->pool, GFP_NOFS); |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index eb9a4447876..88ad8a2501b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -29,8 +29,8 @@ static void __register_request(struct ceph_osd_client *osdc, | |||
29 | struct ceph_osd_request *req); | 29 | struct ceph_osd_request *req); |
30 | static void __unregister_linger_request(struct ceph_osd_client *osdc, | 30 | static void __unregister_linger_request(struct ceph_osd_client *osdc, |
31 | struct ceph_osd_request *req); | 31 | struct ceph_osd_request *req); |
32 | static void __send_request(struct ceph_osd_client *osdc, | 32 | static int __send_request(struct ceph_osd_client *osdc, |
33 | struct ceph_osd_request *req); | 33 | struct ceph_osd_request *req); |
34 | 34 | ||
35 | static int op_needs_trail(int op) | 35 | static int op_needs_trail(int op) |
36 | { | 36 | { |
@@ -52,7 +52,7 @@ static int op_has_extent(int op) | |||
52 | op == CEPH_OSD_OP_WRITE); | 52 | op == CEPH_OSD_OP_WRITE); |
53 | } | 53 | } |
54 | 54 | ||
55 | int ceph_calc_raw_layout(struct ceph_osd_client *osdc, | 55 | void ceph_calc_raw_layout(struct ceph_osd_client *osdc, |
56 | struct ceph_file_layout *layout, | 56 | struct ceph_file_layout *layout, |
57 | u64 snapid, | 57 | u64 snapid, |
58 | u64 off, u64 *plen, u64 *bno, | 58 | u64 off, u64 *plen, u64 *bno, |
@@ -62,15 +62,12 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc, | |||
62 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; | 62 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; |
63 | u64 orig_len = *plen; | 63 | u64 orig_len = *plen; |
64 | u64 objoff, objlen; /* extent in object */ | 64 | u64 objoff, objlen; /* extent in object */ |
65 | int r; | ||
66 | 65 | ||
67 | reqhead->snapid = cpu_to_le64(snapid); | 66 | reqhead->snapid = cpu_to_le64(snapid); |
68 | 67 | ||
69 | /* object extent? */ | 68 | /* object extent? */ |
70 | r = ceph_calc_file_object_mapping(layout, off, plen, bno, | 69 | ceph_calc_file_object_mapping(layout, off, plen, bno, |
71 | &objoff, &objlen); | 70 | &objoff, &objlen); |
72 | if (r < 0) | ||
73 | return r; | ||
74 | if (*plen < orig_len) | 71 | if (*plen < orig_len) |
75 | dout(" skipping last %llu, final file extent %llu~%llu\n", | 72 | dout(" skipping last %llu, final file extent %llu~%llu\n", |
76 | orig_len - *plen, off, *plen); | 73 | orig_len - *plen, off, *plen); |
@@ -86,7 +83,7 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc, | |||
86 | 83 | ||
87 | dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", | 84 | dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", |
88 | *bno, objoff, objlen, req->r_num_pages); | 85 | *bno, objoff, objlen, req->r_num_pages); |
89 | return 0; | 86 | |
90 | } | 87 | } |
91 | EXPORT_SYMBOL(ceph_calc_raw_layout); | 88 | EXPORT_SYMBOL(ceph_calc_raw_layout); |
92 | 89 | ||
@@ -115,25 +112,20 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); | |||
115 | * | 112 | * |
116 | * fill osd op in request message. | 113 | * fill osd op in request message. |
117 | */ | 114 | */ |
118 | static int calc_layout(struct ceph_osd_client *osdc, | 115 | static void calc_layout(struct ceph_osd_client *osdc, |
119 | struct ceph_vino vino, | 116 | struct ceph_vino vino, |
120 | struct ceph_file_layout *layout, | 117 | struct ceph_file_layout *layout, |
121 | u64 off, u64 *plen, | 118 | u64 off, u64 *plen, |
122 | struct ceph_osd_request *req, | 119 | struct ceph_osd_request *req, |
123 | struct ceph_osd_req_op *op) | 120 | struct ceph_osd_req_op *op) |
124 | { | 121 | { |
125 | u64 bno; | 122 | u64 bno; |
126 | int r; | ||
127 | 123 | ||
128 | r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, | 124 | ceph_calc_raw_layout(osdc, layout, vino.snap, off, |
129 | plen, &bno, req, op); | 125 | plen, &bno, req, op); |
130 | if (r < 0) | ||
131 | return r; | ||
132 | 126 | ||
133 | snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); | 127 | snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); |
134 | req->r_oid_len = strlen(req->r_oid); | 128 | req->r_oid_len = strlen(req->r_oid); |
135 | |||
136 | return r; | ||
137 | } | 129 | } |
138 | 130 | ||
139 | /* | 131 | /* |
@@ -147,14 +139,15 @@ void ceph_osdc_release_request(struct kref *kref) | |||
147 | 139 | ||
148 | if (req->r_request) | 140 | if (req->r_request) |
149 | ceph_msg_put(req->r_request); | 141 | ceph_msg_put(req->r_request); |
142 | if (req->r_reply) | ||
143 | ceph_msg_put(req->r_reply); | ||
150 | if (req->r_con_filling_msg) { | 144 | if (req->r_con_filling_msg) { |
151 | dout("%s revoking pages %p from con %p\n", __func__, | 145 | dout("release_request revoking pages %p from con %p\n", |
152 | req->r_pages, req->r_con_filling_msg); | 146 | req->r_pages, req->r_con_filling_msg); |
153 | ceph_msg_revoke_incoming(req->r_reply); | 147 | ceph_con_revoke_message(req->r_con_filling_msg, |
154 | req->r_con_filling_msg->ops->put(req->r_con_filling_msg); | 148 | req->r_reply); |
149 | ceph_con_put(req->r_con_filling_msg); | ||
155 | } | 150 | } |
156 | if (req->r_reply) | ||
157 | ceph_msg_put(req->r_reply); | ||
158 | if (req->r_own_pages) | 151 | if (req->r_own_pages) |
159 | ceph_release_page_vector(req->r_pages, | 152 | ceph_release_page_vector(req->r_pages, |
160 | req->r_num_pages); | 153 | req->r_num_pages); |
@@ -221,13 +214,10 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
221 | kref_init(&req->r_kref); | 214 | kref_init(&req->r_kref); |
222 | init_completion(&req->r_completion); | 215 | init_completion(&req->r_completion); |
223 | init_completion(&req->r_safe_completion); | 216 | init_completion(&req->r_safe_completion); |
224 | RB_CLEAR_NODE(&req->r_node); | ||
225 | INIT_LIST_HEAD(&req->r_unsafe_item); | 217 | INIT_LIST_HEAD(&req->r_unsafe_item); |
226 | INIT_LIST_HEAD(&req->r_linger_item); | 218 | INIT_LIST_HEAD(&req->r_linger_item); |
227 | INIT_LIST_HEAD(&req->r_linger_osd); | 219 | INIT_LIST_HEAD(&req->r_linger_osd); |
228 | INIT_LIST_HEAD(&req->r_req_lru_item); | 220 | INIT_LIST_HEAD(&req->r_req_lru_item); |
229 | INIT_LIST_HEAD(&req->r_osd_item); | ||
230 | |||
231 | req->r_flags = flags; | 221 | req->r_flags = flags; |
232 | 222 | ||
233 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); | 223 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); |
@@ -237,7 +227,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
237 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); | 227 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); |
238 | else | 228 | else |
239 | msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, | 229 | msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, |
240 | OSD_OPREPLY_FRONT_LEN, gfp_flags, true); | 230 | OSD_OPREPLY_FRONT_LEN, gfp_flags); |
241 | if (!msg) { | 231 | if (!msg) { |
242 | ceph_osdc_put_request(req); | 232 | ceph_osdc_put_request(req); |
243 | return NULL; | 233 | return NULL; |
@@ -253,20 +243,20 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
253 | } | 243 | } |
254 | ceph_pagelist_init(req->r_trail); | 244 | ceph_pagelist_init(req->r_trail); |
255 | } | 245 | } |
256 | |||
257 | /* create request message; allow space for oid */ | 246 | /* create request message; allow space for oid */ |
258 | msg_size += MAX_OBJ_NAME_SIZE; | 247 | msg_size += 40; |
259 | if (snapc) | 248 | if (snapc) |
260 | msg_size += sizeof(u64) * snapc->num_snaps; | 249 | msg_size += sizeof(u64) * snapc->num_snaps; |
261 | if (use_mempool) | 250 | if (use_mempool) |
262 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); | 251 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); |
263 | else | 252 | else |
264 | msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); | 253 | msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); |
265 | if (!msg) { | 254 | if (!msg) { |
266 | ceph_osdc_put_request(req); | 255 | ceph_osdc_put_request(req); |
267 | return NULL; | 256 | return NULL; |
268 | } | 257 | } |
269 | 258 | ||
259 | msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); | ||
270 | memset(msg->front.iov_base, 0, msg->front.iov_len); | 260 | memset(msg->front.iov_base, 0, msg->front.iov_len); |
271 | 261 | ||
272 | req->r_request = msg; | 262 | req->r_request = msg; |
@@ -288,7 +278,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, | |||
288 | { | 278 | { |
289 | dst->op = cpu_to_le16(src->op); | 279 | dst->op = cpu_to_le16(src->op); |
290 | 280 | ||
291 | switch (src->op) { | 281 | switch (dst->op) { |
292 | case CEPH_OSD_OP_READ: | 282 | case CEPH_OSD_OP_READ: |
293 | case CEPH_OSD_OP_WRITE: | 283 | case CEPH_OSD_OP_WRITE: |
294 | dst->extent.offset = | 284 | dst->extent.offset = |
@@ -464,7 +454,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
464 | { | 454 | { |
465 | struct ceph_osd_req_op ops[3]; | 455 | struct ceph_osd_req_op ops[3]; |
466 | struct ceph_osd_request *req; | 456 | struct ceph_osd_request *req; |
467 | int r; | ||
468 | 457 | ||
469 | ops[0].op = opcode; | 458 | ops[0].op = opcode; |
470 | ops[0].extent.truncate_seq = truncate_seq; | 459 | ops[0].extent.truncate_seq = truncate_seq; |
@@ -483,12 +472,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
483 | use_mempool, | 472 | use_mempool, |
484 | GFP_NOFS, NULL, NULL); | 473 | GFP_NOFS, NULL, NULL); |
485 | if (!req) | 474 | if (!req) |
486 | return ERR_PTR(-ENOMEM); | 475 | return NULL; |
487 | 476 | ||
488 | /* calculate max write size */ | 477 | /* calculate max write size */ |
489 | r = calc_layout(osdc, vino, layout, off, plen, req, ops); | 478 | calc_layout(osdc, vino, layout, off, plen, req, ops); |
490 | if (r < 0) | ||
491 | return ERR_PTR(r); | ||
492 | req->r_file_layout = *layout; /* keep a copy */ | 479 | req->r_file_layout = *layout; /* keep a copy */ |
493 | 480 | ||
494 | /* in case it differs from natural (file) alignment that | 481 | /* in case it differs from natural (file) alignment that |
@@ -581,7 +568,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, | |||
581 | 568 | ||
582 | dout("__kick_osd_requests osd%d\n", osd->o_osd); | 569 | dout("__kick_osd_requests osd%d\n", osd->o_osd); |
583 | err = __reset_osd(osdc, osd); | 570 | err = __reset_osd(osdc, osd); |
584 | if (err) | 571 | if (err == -EAGAIN) |
585 | return; | 572 | return; |
586 | 573 | ||
587 | list_for_each_entry(req, &osd->o_requests, r_osd_item) { | 574 | list_for_each_entry(req, &osd->o_requests, r_osd_item) { |
@@ -608,6 +595,14 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, | |||
608 | } | 595 | } |
609 | } | 596 | } |
610 | 597 | ||
598 | static void kick_osd_requests(struct ceph_osd_client *osdc, | ||
599 | struct ceph_osd *kickosd) | ||
600 | { | ||
601 | mutex_lock(&osdc->request_mutex); | ||
602 | __kick_osd_requests(osdc, kickosd); | ||
603 | mutex_unlock(&osdc->request_mutex); | ||
604 | } | ||
605 | |||
611 | /* | 606 | /* |
612 | * If the osd connection drops, we need to resubmit all requests. | 607 | * If the osd connection drops, we need to resubmit all requests. |
613 | */ | 608 | */ |
@@ -621,9 +616,7 @@ static void osd_reset(struct ceph_connection *con) | |||
621 | dout("osd_reset osd%d\n", osd->o_osd); | 616 | dout("osd_reset osd%d\n", osd->o_osd); |
622 | osdc = osd->o_osdc; | 617 | osdc = osd->o_osdc; |
623 | down_read(&osdc->map_sem); | 618 | down_read(&osdc->map_sem); |
624 | mutex_lock(&osdc->request_mutex); | 619 | kick_osd_requests(osdc, osd); |
625 | __kick_osd_requests(osdc, osd); | ||
626 | mutex_unlock(&osdc->request_mutex); | ||
627 | send_queued(osdc); | 620 | send_queued(osdc); |
628 | up_read(&osdc->map_sem); | 621 | up_read(&osdc->map_sem); |
629 | } | 622 | } |
@@ -631,7 +624,7 @@ static void osd_reset(struct ceph_connection *con) | |||
631 | /* | 624 | /* |
632 | * Track open sessions with osds. | 625 | * Track open sessions with osds. |
633 | */ | 626 | */ |
634 | static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) | 627 | static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) |
635 | { | 628 | { |
636 | struct ceph_osd *osd; | 629 | struct ceph_osd *osd; |
637 | 630 | ||
@@ -641,14 +634,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) | |||
641 | 634 | ||
642 | atomic_set(&osd->o_ref, 1); | 635 | atomic_set(&osd->o_ref, 1); |
643 | osd->o_osdc = osdc; | 636 | osd->o_osdc = osdc; |
644 | osd->o_osd = onum; | ||
645 | RB_CLEAR_NODE(&osd->o_node); | ||
646 | INIT_LIST_HEAD(&osd->o_requests); | 637 | INIT_LIST_HEAD(&osd->o_requests); |
647 | INIT_LIST_HEAD(&osd->o_linger_requests); | 638 | INIT_LIST_HEAD(&osd->o_linger_requests); |
648 | INIT_LIST_HEAD(&osd->o_osd_lru); | 639 | INIT_LIST_HEAD(&osd->o_osd_lru); |
649 | osd->o_incarnation = 1; | 640 | osd->o_incarnation = 1; |
650 | 641 | ||
651 | ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); | 642 | ceph_con_init(osdc->client->msgr, &osd->o_con); |
643 | osd->o_con.private = osd; | ||
644 | osd->o_con.ops = &osd_con_ops; | ||
645 | osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; | ||
652 | 646 | ||
653 | INIT_LIST_HEAD(&osd->o_keepalive_item); | 647 | INIT_LIST_HEAD(&osd->o_keepalive_item); |
654 | return osd; | 648 | return osd; |
@@ -670,11 +664,11 @@ static void put_osd(struct ceph_osd *osd) | |||
670 | { | 664 | { |
671 | dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), | 665 | dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), |
672 | atomic_read(&osd->o_ref) - 1); | 666 | atomic_read(&osd->o_ref) - 1); |
673 | if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { | 667 | if (atomic_dec_and_test(&osd->o_ref)) { |
674 | struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; | 668 | struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; |
675 | 669 | ||
676 | if (ac->ops && ac->ops->destroy_authorizer) | 670 | if (osd->o_authorizer) |
677 | ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); | 671 | ac->ops->destroy_authorizer(ac, osd->o_authorizer); |
678 | kfree(osd); | 672 | kfree(osd); |
679 | } | 673 | } |
680 | } | 674 | } |
@@ -694,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | |||
694 | 688 | ||
695 | static void remove_all_osds(struct ceph_osd_client *osdc) | 689 | static void remove_all_osds(struct ceph_osd_client *osdc) |
696 | { | 690 | { |
697 | dout("%s %p\n", __func__, osdc); | 691 | dout("__remove_old_osds %p\n", osdc); |
698 | mutex_lock(&osdc->request_mutex); | 692 | mutex_lock(&osdc->request_mutex); |
699 | while (!RB_EMPTY_ROOT(&osdc->osds)) { | 693 | while (!RB_EMPTY_ROOT(&osdc->osds)) { |
700 | struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), | 694 | struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), |
@@ -746,7 +740,6 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | |||
746 | if (list_empty(&osd->o_requests) && | 740 | if (list_empty(&osd->o_requests) && |
747 | list_empty(&osd->o_linger_requests)) { | 741 | list_empty(&osd->o_linger_requests)) { |
748 | __remove_osd(osdc, osd); | 742 | __remove_osd(osdc, osd); |
749 | ret = -ENODEV; | ||
750 | } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], | 743 | } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], |
751 | &osd->o_con.peer_addr, | 744 | &osd->o_con.peer_addr, |
752 | sizeof(osd->o_con.peer_addr)) == 0 && | 745 | sizeof(osd->o_con.peer_addr)) == 0 && |
@@ -759,8 +752,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | |||
759 | ret = -EAGAIN; | 752 | ret = -EAGAIN; |
760 | } else { | 753 | } else { |
761 | ceph_con_close(&osd->o_con); | 754 | ceph_con_close(&osd->o_con); |
762 | ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, | 755 | ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); |
763 | &osdc->osdmap->osd_addr[osd->o_osd]); | ||
764 | osd->o_incarnation++; | 756 | osd->o_incarnation++; |
765 | } | 757 | } |
766 | return ret; | 758 | return ret; |
@@ -849,19 +841,13 @@ static void register_request(struct ceph_osd_client *osdc, | |||
849 | static void __unregister_request(struct ceph_osd_client *osdc, | 841 | static void __unregister_request(struct ceph_osd_client *osdc, |
850 | struct ceph_osd_request *req) | 842 | struct ceph_osd_request *req) |
851 | { | 843 | { |
852 | if (RB_EMPTY_NODE(&req->r_node)) { | ||
853 | dout("__unregister_request %p tid %lld not registered\n", | ||
854 | req, req->r_tid); | ||
855 | return; | ||
856 | } | ||
857 | |||
858 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); | 844 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); |
859 | rb_erase(&req->r_node, &osdc->requests); | 845 | rb_erase(&req->r_node, &osdc->requests); |
860 | osdc->num_requests--; | 846 | osdc->num_requests--; |
861 | 847 | ||
862 | if (req->r_osd) { | 848 | if (req->r_osd) { |
863 | /* make sure the original request isn't in flight. */ | 849 | /* make sure the original request isn't in flight. */ |
864 | ceph_msg_revoke(req->r_request); | 850 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); |
865 | 851 | ||
866 | list_del_init(&req->r_osd_item); | 852 | list_del_init(&req->r_osd_item); |
867 | if (list_empty(&req->r_osd->o_requests) && | 853 | if (list_empty(&req->r_osd->o_requests) && |
@@ -873,9 +859,9 @@ static void __unregister_request(struct ceph_osd_client *osdc, | |||
873 | req->r_osd = NULL; | 859 | req->r_osd = NULL; |
874 | } | 860 | } |
875 | 861 | ||
876 | list_del_init(&req->r_req_lru_item); | ||
877 | ceph_osdc_put_request(req); | 862 | ceph_osdc_put_request(req); |
878 | 863 | ||
864 | list_del_init(&req->r_req_lru_item); | ||
879 | if (osdc->num_requests == 0) { | 865 | if (osdc->num_requests == 0) { |
880 | dout(" no requests, canceling timeout\n"); | 866 | dout(" no requests, canceling timeout\n"); |
881 | __cancel_osd_timeout(osdc); | 867 | __cancel_osd_timeout(osdc); |
@@ -888,7 +874,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, | |||
888 | static void __cancel_request(struct ceph_osd_request *req) | 874 | static void __cancel_request(struct ceph_osd_request *req) |
889 | { | 875 | { |
890 | if (req->r_sent && req->r_osd) { | 876 | if (req->r_sent && req->r_osd) { |
891 | ceph_msg_revoke(req->r_request); | 877 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); |
892 | req->r_sent = 0; | 878 | req->r_sent = 0; |
893 | } | 879 | } |
894 | } | 880 | } |
@@ -898,17 +884,15 @@ static void __register_linger_request(struct ceph_osd_client *osdc, | |||
898 | { | 884 | { |
899 | dout("__register_linger_request %p\n", req); | 885 | dout("__register_linger_request %p\n", req); |
900 | list_add_tail(&req->r_linger_item, &osdc->req_linger); | 886 | list_add_tail(&req->r_linger_item, &osdc->req_linger); |
901 | if (req->r_osd) | 887 | list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); |
902 | list_add_tail(&req->r_linger_osd, | ||
903 | &req->r_osd->o_linger_requests); | ||
904 | } | 888 | } |
905 | 889 | ||
906 | static void __unregister_linger_request(struct ceph_osd_client *osdc, | 890 | static void __unregister_linger_request(struct ceph_osd_client *osdc, |
907 | struct ceph_osd_request *req) | 891 | struct ceph_osd_request *req) |
908 | { | 892 | { |
909 | dout("__unregister_linger_request %p\n", req); | 893 | dout("__unregister_linger_request %p\n", req); |
910 | list_del_init(&req->r_linger_item); | ||
911 | if (req->r_osd) { | 894 | if (req->r_osd) { |
895 | list_del_init(&req->r_linger_item); | ||
912 | list_del_init(&req->r_linger_osd); | 896 | list_del_init(&req->r_linger_osd); |
913 | 897 | ||
914 | if (list_empty(&req->r_osd->o_requests) && | 898 | if (list_empty(&req->r_osd->o_requests) && |
@@ -959,7 +943,7 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger); | |||
959 | * Caller should hold map_sem for read and request_mutex. | 943 | * Caller should hold map_sem for read and request_mutex. |
960 | */ | 944 | */ |
961 | static int __map_request(struct ceph_osd_client *osdc, | 945 | static int __map_request(struct ceph_osd_client *osdc, |
962 | struct ceph_osd_request *req, int force_resend) | 946 | struct ceph_osd_request *req) |
963 | { | 947 | { |
964 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; | 948 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; |
965 | struct ceph_pg pgid; | 949 | struct ceph_pg pgid; |
@@ -983,8 +967,7 @@ static int __map_request(struct ceph_osd_client *osdc, | |||
983 | num = err; | 967 | num = err; |
984 | } | 968 | } |
985 | 969 | ||
986 | if ((!force_resend && | 970 | if ((req->r_osd && req->r_osd->o_osd == o && |
987 | req->r_osd && req->r_osd->o_osd == o && | ||
988 | req->r_sent >= req->r_osd->o_incarnation && | 971 | req->r_sent >= req->r_osd->o_incarnation && |
989 | req->r_num_pg_osds == num && | 972 | req->r_num_pg_osds == num && |
990 | memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || | 973 | memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || |
@@ -1008,18 +991,18 @@ static int __map_request(struct ceph_osd_client *osdc, | |||
1008 | req->r_osd = __lookup_osd(osdc, o); | 991 | req->r_osd = __lookup_osd(osdc, o); |
1009 | if (!req->r_osd && o >= 0) { | 992 | if (!req->r_osd && o >= 0) { |
1010 | err = -ENOMEM; | 993 | err = -ENOMEM; |
1011 | req->r_osd = create_osd(osdc, o); | 994 | req->r_osd = create_osd(osdc); |
1012 | if (!req->r_osd) { | 995 | if (!req->r_osd) { |
1013 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | 996 | list_move(&req->r_req_lru_item, &osdc->req_notarget); |
1014 | goto out; | 997 | goto out; |
1015 | } | 998 | } |
1016 | 999 | ||
1017 | dout("map_request osd %p is osd%d\n", req->r_osd, o); | 1000 | dout("map_request osd %p is osd%d\n", req->r_osd, o); |
1001 | req->r_osd->o_osd = o; | ||
1002 | req->r_osd->o_con.peer_name.num = cpu_to_le64(o); | ||
1018 | __insert_osd(osdc, req->r_osd); | 1003 | __insert_osd(osdc, req->r_osd); |
1019 | 1004 | ||
1020 | ceph_con_open(&req->r_osd->o_con, | 1005 | ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); |
1021 | CEPH_ENTITY_TYPE_OSD, o, | ||
1022 | &osdc->osdmap->osd_addr[o]); | ||
1023 | } | 1006 | } |
1024 | 1007 | ||
1025 | if (req->r_osd) { | 1008 | if (req->r_osd) { |
@@ -1038,8 +1021,8 @@ out: | |||
1038 | /* | 1021 | /* |
1039 | * caller should hold map_sem (for read) and request_mutex | 1022 | * caller should hold map_sem (for read) and request_mutex |
1040 | */ | 1023 | */ |
1041 | static void __send_request(struct ceph_osd_client *osdc, | 1024 | static int __send_request(struct ceph_osd_client *osdc, |
1042 | struct ceph_osd_request *req) | 1025 | struct ceph_osd_request *req) |
1043 | { | 1026 | { |
1044 | struct ceph_osd_request_head *reqhead; | 1027 | struct ceph_osd_request_head *reqhead; |
1045 | 1028 | ||
@@ -1057,6 +1040,7 @@ static void __send_request(struct ceph_osd_client *osdc, | |||
1057 | ceph_msg_get(req->r_request); /* send consumes a ref */ | 1040 | ceph_msg_get(req->r_request); /* send consumes a ref */ |
1058 | ceph_con_send(&req->r_osd->o_con, req->r_request); | 1041 | ceph_con_send(&req->r_osd->o_con, req->r_request); |
1059 | req->r_sent = req->r_osd->o_incarnation; | 1042 | req->r_sent = req->r_osd->o_incarnation; |
1043 | return 0; | ||
1060 | } | 1044 | } |
1061 | 1045 | ||
1062 | /* | 1046 | /* |
@@ -1087,10 +1071,12 @@ static void handle_timeout(struct work_struct *work) | |||
1087 | { | 1071 | { |
1088 | struct ceph_osd_client *osdc = | 1072 | struct ceph_osd_client *osdc = |
1089 | container_of(work, struct ceph_osd_client, timeout_work.work); | 1073 | container_of(work, struct ceph_osd_client, timeout_work.work); |
1090 | struct ceph_osd_request *req; | 1074 | struct ceph_osd_request *req, *last_req = NULL; |
1091 | struct ceph_osd *osd; | 1075 | struct ceph_osd *osd; |
1076 | unsigned long timeout = osdc->client->options->osd_timeout * HZ; | ||
1092 | unsigned long keepalive = | 1077 | unsigned long keepalive = |
1093 | osdc->client->options->osd_keepalive_timeout * HZ; | 1078 | osdc->client->options->osd_keepalive_timeout * HZ; |
1079 | unsigned long last_stamp = 0; | ||
1094 | struct list_head slow_osds; | 1080 | struct list_head slow_osds; |
1095 | dout("timeout\n"); | 1081 | dout("timeout\n"); |
1096 | down_read(&osdc->map_sem); | 1082 | down_read(&osdc->map_sem); |
@@ -1100,6 +1086,37 @@ static void handle_timeout(struct work_struct *work) | |||
1100 | mutex_lock(&osdc->request_mutex); | 1086 | mutex_lock(&osdc->request_mutex); |
1101 | 1087 | ||
1102 | /* | 1088 | /* |
1089 | * reset osds that appear to be _really_ unresponsive. this | ||
1090 | * is a failsafe measure.. we really shouldn't be getting to | ||
1091 | * this point if the system is working properly. the monitors | ||
1092 | * should mark the osd as failed and we should find out about | ||
1093 | * it from an updated osd map. | ||
1094 | */ | ||
1095 | while (timeout && !list_empty(&osdc->req_lru)) { | ||
1096 | req = list_entry(osdc->req_lru.next, struct ceph_osd_request, | ||
1097 | r_req_lru_item); | ||
1098 | |||
1099 | /* hasn't been long enough since we sent it? */ | ||
1100 | if (time_before(jiffies, req->r_stamp + timeout)) | ||
1101 | break; | ||
1102 | |||
1103 | /* hasn't been long enough since it was acked? */ | ||
1104 | if (req->r_request->ack_stamp == 0 || | ||
1105 | time_before(jiffies, req->r_request->ack_stamp + timeout)) | ||
1106 | break; | ||
1107 | |||
1108 | BUG_ON(req == last_req && req->r_stamp == last_stamp); | ||
1109 | last_req = req; | ||
1110 | last_stamp = req->r_stamp; | ||
1111 | |||
1112 | osd = req->r_osd; | ||
1113 | BUG_ON(!osd); | ||
1114 | pr_warning(" tid %llu timed out on osd%d, will reset osd\n", | ||
1115 | req->r_tid, osd->o_osd); | ||
1116 | __kick_osd_requests(osdc, osd); | ||
1117 | } | ||
1118 | |||
1119 | /* | ||
1103 | * ping osds that are a bit slow. this ensures that if there | 1120 | * ping osds that are a bit slow. this ensures that if there |
1104 | * is a break in the TCP connection we will notice, and reopen | 1121 | * is a break in the TCP connection we will notice, and reopen |
1105 | * a connection with that osd (from the fault callback). | 1122 | * a connection with that osd (from the fault callback). |
@@ -1193,11 +1210,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1193 | if (req->r_con_filling_msg == con && req->r_reply == msg) { | 1210 | if (req->r_con_filling_msg == con && req->r_reply == msg) { |
1194 | dout(" dropping con_filling_msg ref %p\n", con); | 1211 | dout(" dropping con_filling_msg ref %p\n", con); |
1195 | req->r_con_filling_msg = NULL; | 1212 | req->r_con_filling_msg = NULL; |
1196 | con->ops->put(con); | 1213 | ceph_con_put(con); |
1197 | } | 1214 | } |
1198 | 1215 | ||
1199 | if (!req->r_got_reply) { | 1216 | if (!req->r_got_reply) { |
1200 | unsigned int bytes; | 1217 | unsigned bytes; |
1201 | 1218 | ||
1202 | req->r_result = le32_to_cpu(rhead->result); | 1219 | req->r_result = le32_to_cpu(rhead->result); |
1203 | bytes = le32_to_cpu(msg->hdr.data_len); | 1220 | bytes = le32_to_cpu(msg->hdr.data_len); |
@@ -1270,51 +1287,30 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) | |||
1270 | * Requeue requests whose mapping to an OSD has changed. If requests map to | 1287 | * Requeue requests whose mapping to an OSD has changed. If requests map to |
1271 | * no osd, request a new map. | 1288 | * no osd, request a new map. |
1272 | * | 1289 | * |
1273 | * Caller should hold map_sem for read. | 1290 | * Caller should hold map_sem for read and request_mutex. |
1274 | */ | 1291 | */ |
1275 | static void kick_requests(struct ceph_osd_client *osdc, int force_resend) | 1292 | static void kick_requests(struct ceph_osd_client *osdc) |
1276 | { | 1293 | { |
1277 | struct ceph_osd_request *req, *nreq; | 1294 | struct ceph_osd_request *req, *nreq; |
1278 | struct rb_node *p; | 1295 | struct rb_node *p; |
1279 | int needmap = 0; | 1296 | int needmap = 0; |
1280 | int err; | 1297 | int err; |
1281 | 1298 | ||
1282 | dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); | 1299 | dout("kick_requests\n"); |
1283 | mutex_lock(&osdc->request_mutex); | 1300 | mutex_lock(&osdc->request_mutex); |
1284 | for (p = rb_first(&osdc->requests); p; ) { | 1301 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { |
1285 | req = rb_entry(p, struct ceph_osd_request, r_node); | 1302 | req = rb_entry(p, struct ceph_osd_request, r_node); |
1286 | p = rb_next(p); | 1303 | err = __map_request(osdc, req); |
1287 | |||
1288 | /* | ||
1289 | * For linger requests that have not yet been | ||
1290 | * registered, move them to the linger list; they'll | ||
1291 | * be sent to the osd in the loop below. Unregister | ||
1292 | * the request before re-registering it as a linger | ||
1293 | * request to ensure the __map_request() below | ||
1294 | * will decide it needs to be sent. | ||
1295 | */ | ||
1296 | if (req->r_linger && list_empty(&req->r_linger_item)) { | ||
1297 | dout("%p tid %llu restart on osd%d\n", | ||
1298 | req, req->r_tid, | ||
1299 | req->r_osd ? req->r_osd->o_osd : -1); | ||
1300 | __unregister_request(osdc, req); | ||
1301 | __register_linger_request(osdc, req); | ||
1302 | continue; | ||
1303 | } | ||
1304 | |||
1305 | err = __map_request(osdc, req, force_resend); | ||
1306 | if (err < 0) | 1304 | if (err < 0) |
1307 | continue; /* error */ | 1305 | continue; /* error */ |
1308 | if (req->r_osd == NULL) { | 1306 | if (req->r_osd == NULL) { |
1309 | dout("%p tid %llu maps to no osd\n", req, req->r_tid); | 1307 | dout("%p tid %llu maps to no osd\n", req, req->r_tid); |
1310 | needmap++; /* request a newer map */ | 1308 | needmap++; /* request a newer map */ |
1311 | } else if (err > 0) { | 1309 | } else if (err > 0) { |
1312 | if (!req->r_linger) { | 1310 | dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, |
1313 | dout("%p tid %llu requeued on osd%d\n", req, | 1311 | req->r_osd ? req->r_osd->o_osd : -1); |
1314 | req->r_tid, | 1312 | if (!req->r_linger) |
1315 | req->r_osd ? req->r_osd->o_osd : -1); | ||
1316 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | 1313 | req->r_flags |= CEPH_OSD_FLAG_RETRY; |
1317 | } | ||
1318 | } | 1314 | } |
1319 | } | 1315 | } |
1320 | 1316 | ||
@@ -1322,8 +1318,7 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) | |||
1322 | r_linger_item) { | 1318 | r_linger_item) { |
1323 | dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); | 1319 | dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); |
1324 | 1320 | ||
1325 | err = __map_request(osdc, req, force_resend); | 1321 | err = __map_request(osdc, req); |
1326 | dout("__map_request returned %d\n", err); | ||
1327 | if (err == 0) | 1322 | if (err == 0) |
1328 | continue; /* no change and no osd was specified */ | 1323 | continue; /* no change and no osd was specified */ |
1329 | if (err < 0) | 1324 | if (err < 0) |
@@ -1336,8 +1331,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) | |||
1336 | 1331 | ||
1337 | dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, | 1332 | dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, |
1338 | req->r_osd ? req->r_osd->o_osd : -1); | 1333 | req->r_osd ? req->r_osd->o_osd : -1); |
1339 | __register_request(osdc, req); | ||
1340 | __unregister_linger_request(osdc, req); | 1334 | __unregister_linger_request(osdc, req); |
1335 | __register_request(osdc, req); | ||
1341 | } | 1336 | } |
1342 | mutex_unlock(&osdc->request_mutex); | 1337 | mutex_unlock(&osdc->request_mutex); |
1343 | 1338 | ||
@@ -1345,7 +1340,6 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) | |||
1345 | dout("%d requests for down osds, need new map\n", needmap); | 1340 | dout("%d requests for down osds, need new map\n", needmap); |
1346 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 1341 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
1347 | } | 1342 | } |
1348 | reset_changed_osds(osdc); | ||
1349 | } | 1343 | } |
1350 | 1344 | ||
1351 | 1345 | ||
@@ -1391,7 +1385,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1391 | epoch, maplen); | 1385 | epoch, maplen); |
1392 | newmap = osdmap_apply_incremental(&p, next, | 1386 | newmap = osdmap_apply_incremental(&p, next, |
1393 | osdc->osdmap, | 1387 | osdc->osdmap, |
1394 | &osdc->client->msgr); | 1388 | osdc->client->msgr); |
1395 | if (IS_ERR(newmap)) { | 1389 | if (IS_ERR(newmap)) { |
1396 | err = PTR_ERR(newmap); | 1390 | err = PTR_ERR(newmap); |
1397 | goto bad; | 1391 | goto bad; |
@@ -1401,7 +1395,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1401 | ceph_osdmap_destroy(osdc->osdmap); | 1395 | ceph_osdmap_destroy(osdc->osdmap); |
1402 | osdc->osdmap = newmap; | 1396 | osdc->osdmap = newmap; |
1403 | } | 1397 | } |
1404 | kick_requests(osdc, 0); | 1398 | kick_requests(osdc); |
1399 | reset_changed_osds(osdc); | ||
1405 | } else { | 1400 | } else { |
1406 | dout("ignoring incremental map %u len %d\n", | 1401 | dout("ignoring incremental map %u len %d\n", |
1407 | epoch, maplen); | 1402 | epoch, maplen); |
@@ -1428,8 +1423,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1428 | "older than our %u\n", epoch, maplen, | 1423 | "older than our %u\n", epoch, maplen, |
1429 | osdc->osdmap->epoch); | 1424 | osdc->osdmap->epoch); |
1430 | } else { | 1425 | } else { |
1431 | int skipped_map = 0; | ||
1432 | |||
1433 | dout("taking full map %u len %d\n", epoch, maplen); | 1426 | dout("taking full map %u len %d\n", epoch, maplen); |
1434 | newmap = osdmap_decode(&p, p+maplen); | 1427 | newmap = osdmap_decode(&p, p+maplen); |
1435 | if (IS_ERR(newmap)) { | 1428 | if (IS_ERR(newmap)) { |
@@ -1439,12 +1432,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1439 | BUG_ON(!newmap); | 1432 | BUG_ON(!newmap); |
1440 | oldmap = osdc->osdmap; | 1433 | oldmap = osdc->osdmap; |
1441 | osdc->osdmap = newmap; | 1434 | osdc->osdmap = newmap; |
1442 | if (oldmap) { | 1435 | if (oldmap) |
1443 | if (oldmap->epoch + 1 < newmap->epoch) | ||
1444 | skipped_map = 1; | ||
1445 | ceph_osdmap_destroy(oldmap); | 1436 | ceph_osdmap_destroy(oldmap); |
1446 | } | 1437 | kick_requests(osdc); |
1447 | kick_requests(osdc, skipped_map); | ||
1448 | } | 1438 | } |
1449 | p += maplen; | 1439 | p += maplen; |
1450 | nr_maps--; | 1440 | nr_maps--; |
@@ -1571,7 +1561,6 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc, | |||
1571 | event->data = data; | 1561 | event->data = data; |
1572 | event->osdc = osdc; | 1562 | event->osdc = osdc; |
1573 | INIT_LIST_HEAD(&event->osd_node); | 1563 | INIT_LIST_HEAD(&event->osd_node); |
1574 | RB_CLEAR_NODE(&event->node); | ||
1575 | kref_init(&event->kref); /* one ref for us */ | 1564 | kref_init(&event->kref); /* one ref for us */ |
1576 | kref_get(&event->kref); /* one ref for the caller */ | 1565 | kref_get(&event->kref); /* one ref for the caller */ |
1577 | init_completion(&event->completion); | 1566 | init_completion(&event->completion); |
@@ -1718,7 +1707,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, | |||
1718 | * the request still han't been touched yet. | 1707 | * the request still han't been touched yet. |
1719 | */ | 1708 | */ |
1720 | if (req->r_sent == 0) { | 1709 | if (req->r_sent == 0) { |
1721 | rc = __map_request(osdc, req, 0); | 1710 | rc = __map_request(osdc, req); |
1722 | if (rc < 0) { | 1711 | if (rc < 0) { |
1723 | if (nofail) { | 1712 | if (nofail) { |
1724 | dout("osdc_start_request failed map, " | 1713 | dout("osdc_start_request failed map, " |
@@ -1731,9 +1720,17 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, | |||
1731 | dout("send_request %p no up osds in pg\n", req); | 1720 | dout("send_request %p no up osds in pg\n", req); |
1732 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 1721 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
1733 | } else { | 1722 | } else { |
1734 | __send_request(osdc, req); | 1723 | rc = __send_request(osdc, req); |
1724 | if (rc) { | ||
1725 | if (nofail) { | ||
1726 | dout("osdc_start_request failed send, " | ||
1727 | " will retry %lld\n", req->r_tid); | ||
1728 | rc = 0; | ||
1729 | } else { | ||
1730 | __unregister_request(osdc, req); | ||
1731 | } | ||
1732 | } | ||
1735 | } | 1733 | } |
1736 | rc = 0; | ||
1737 | } | 1734 | } |
1738 | 1735 | ||
1739 | out_unlock: | 1736 | out_unlock: |
@@ -1839,12 +1836,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | |||
1839 | if (!osdc->req_mempool) | 1836 | if (!osdc->req_mempool) |
1840 | goto out; | 1837 | goto out; |
1841 | 1838 | ||
1842 | err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, | 1839 | err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, |
1843 | OSD_OP_FRONT_LEN, 10, true, | ||
1844 | "osd_op"); | 1840 | "osd_op"); |
1845 | if (err < 0) | 1841 | if (err < 0) |
1846 | goto out_mempool; | 1842 | goto out_mempool; |
1847 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, | 1843 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, |
1848 | OSD_OPREPLY_FRONT_LEN, 10, true, | 1844 | OSD_OPREPLY_FRONT_LEN, 10, true, |
1849 | "osd_op_reply"); | 1845 | "osd_op_reply"); |
1850 | if (err < 0) | 1846 | if (err < 0) |
@@ -1903,8 +1899,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, | |||
1903 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, | 1899 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, |
1904 | NULL, 0, truncate_seq, truncate_size, NULL, | 1900 | NULL, 0, truncate_seq, truncate_size, NULL, |
1905 | false, 1, page_align); | 1901 | false, 1, page_align); |
1906 | if (IS_ERR(req)) | 1902 | if (!req) |
1907 | return PTR_ERR(req); | 1903 | return -ENOMEM; |
1908 | 1904 | ||
1909 | /* it may be a short read due to an object boundary */ | 1905 | /* it may be a short read due to an object boundary */ |
1910 | req->r_pages = pages; | 1906 | req->r_pages = pages; |
@@ -1946,8 +1942,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
1946 | snapc, do_sync, | 1942 | snapc, do_sync, |
1947 | truncate_seq, truncate_size, mtime, | 1943 | truncate_seq, truncate_size, mtime, |
1948 | nofail, 1, page_align); | 1944 | nofail, 1, page_align); |
1949 | if (IS_ERR(req)) | 1945 | if (!req) |
1950 | return PTR_ERR(req); | 1946 | return -ENOMEM; |
1951 | 1947 | ||
1952 | /* it may be a short write due to an object boundary */ | 1948 | /* it may be a short write due to an object boundary */ |
1953 | req->r_pages = pages; | 1949 | req->r_pages = pages; |
@@ -2020,23 +2016,23 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, | |||
2020 | if (!req) { | 2016 | if (!req) { |
2021 | *skip = 1; | 2017 | *skip = 1; |
2022 | m = NULL; | 2018 | m = NULL; |
2023 | dout("get_reply unknown tid %llu from osd%d\n", tid, | 2019 | pr_info("get_reply unknown tid %llu from osd%d\n", tid, |
2024 | osd->o_osd); | 2020 | osd->o_osd); |
2025 | goto out; | 2021 | goto out; |
2026 | } | 2022 | } |
2027 | 2023 | ||
2028 | if (req->r_con_filling_msg) { | 2024 | if (req->r_con_filling_msg) { |
2029 | dout("%s revoking msg %p from old con %p\n", __func__, | 2025 | dout("get_reply revoking msg %p from old con %p\n", |
2030 | req->r_reply, req->r_con_filling_msg); | 2026 | req->r_reply, req->r_con_filling_msg); |
2031 | ceph_msg_revoke_incoming(req->r_reply); | 2027 | ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); |
2032 | req->r_con_filling_msg->ops->put(req->r_con_filling_msg); | 2028 | ceph_con_put(req->r_con_filling_msg); |
2033 | req->r_con_filling_msg = NULL; | 2029 | req->r_con_filling_msg = NULL; |
2034 | } | 2030 | } |
2035 | 2031 | ||
2036 | if (front > req->r_reply->front.iov_len) { | 2032 | if (front > req->r_reply->front.iov_len) { |
2037 | pr_warning("get_reply front %d > preallocated %d\n", | 2033 | pr_warning("get_reply front %d > preallocated %d\n", |
2038 | front, (int)req->r_reply->front.iov_len); | 2034 | front, (int)req->r_reply->front.iov_len); |
2039 | m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); | 2035 | m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); |
2040 | if (!m) | 2036 | if (!m) |
2041 | goto out; | 2037 | goto out; |
2042 | ceph_msg_put(req->r_reply); | 2038 | ceph_msg_put(req->r_reply); |
@@ -2064,7 +2060,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, | |||
2064 | #endif | 2060 | #endif |
2065 | } | 2061 | } |
2066 | *skip = 0; | 2062 | *skip = 0; |
2067 | req->r_con_filling_msg = con->ops->get(con); | 2063 | req->r_con_filling_msg = ceph_con_get(con); |
2068 | dout("get_reply tid %lld %p\n", tid, m); | 2064 | dout("get_reply tid %lld %p\n", tid, m); |
2069 | 2065 | ||
2070 | out: | 2066 | out: |
@@ -2081,11 +2077,10 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, | |||
2081 | int type = le16_to_cpu(hdr->type); | 2077 | int type = le16_to_cpu(hdr->type); |
2082 | int front = le32_to_cpu(hdr->front_len); | 2078 | int front = le32_to_cpu(hdr->front_len); |
2083 | 2079 | ||
2084 | *skip = 0; | ||
2085 | switch (type) { | 2080 | switch (type) { |
2086 | case CEPH_MSG_OSD_MAP: | 2081 | case CEPH_MSG_OSD_MAP: |
2087 | case CEPH_MSG_WATCH_NOTIFY: | 2082 | case CEPH_MSG_WATCH_NOTIFY: |
2088 | return ceph_msg_new(type, front, GFP_NOFS, false); | 2083 | return ceph_msg_new(type, front, GFP_NOFS); |
2089 | case CEPH_MSG_OSD_OPREPLY: | 2084 | case CEPH_MSG_OSD_OPREPLY: |
2090 | return get_reply(con, hdr, skip); | 2085 | return get_reply(con, hdr, skip); |
2091 | default: | 2086 | default: |
@@ -2116,32 +2111,37 @@ static void put_osd_con(struct ceph_connection *con) | |||
2116 | /* | 2111 | /* |
2117 | * authentication | 2112 | * authentication |
2118 | */ | 2113 | */ |
2119 | /* | 2114 | static int get_authorizer(struct ceph_connection *con, |
2120 | * Note: returned pointer is the address of a structure that's | 2115 | void **buf, int *len, int *proto, |
2121 | * managed separately. Caller must *not* attempt to free it. | 2116 | void **reply_buf, int *reply_len, int force_new) |
2122 | */ | ||
2123 | static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, | ||
2124 | int *proto, int force_new) | ||
2125 | { | 2117 | { |
2126 | struct ceph_osd *o = con->private; | 2118 | struct ceph_osd *o = con->private; |
2127 | struct ceph_osd_client *osdc = o->o_osdc; | 2119 | struct ceph_osd_client *osdc = o->o_osdc; |
2128 | struct ceph_auth_client *ac = osdc->client->monc.auth; | 2120 | struct ceph_auth_client *ac = osdc->client->monc.auth; |
2129 | struct ceph_auth_handshake *auth = &o->o_auth; | 2121 | int ret = 0; |
2130 | 2122 | ||
2131 | if (force_new && auth->authorizer) { | 2123 | if (force_new && o->o_authorizer) { |
2132 | if (ac->ops && ac->ops->destroy_authorizer) | 2124 | ac->ops->destroy_authorizer(ac, o->o_authorizer); |
2133 | ac->ops->destroy_authorizer(ac, auth->authorizer); | 2125 | o->o_authorizer = NULL; |
2134 | auth->authorizer = NULL; | 2126 | } |
2135 | } | 2127 | if (o->o_authorizer == NULL) { |
2136 | if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { | 2128 | ret = ac->ops->create_authorizer( |
2137 | int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, | 2129 | ac, CEPH_ENTITY_TYPE_OSD, |
2138 | auth); | 2130 | &o->o_authorizer, |
2131 | &o->o_authorizer_buf, | ||
2132 | &o->o_authorizer_buf_len, | ||
2133 | &o->o_authorizer_reply_buf, | ||
2134 | &o->o_authorizer_reply_buf_len); | ||
2139 | if (ret) | 2135 | if (ret) |
2140 | return ERR_PTR(ret); | 2136 | return ret; |
2141 | } | 2137 | } |
2142 | *proto = ac->protocol; | ||
2143 | 2138 | ||
2144 | return auth; | 2139 | *proto = ac->protocol; |
2140 | *buf = o->o_authorizer_buf; | ||
2141 | *len = o->o_authorizer_buf_len; | ||
2142 | *reply_buf = o->o_authorizer_reply_buf; | ||
2143 | *reply_len = o->o_authorizer_reply_buf_len; | ||
2144 | return 0; | ||
2145 | } | 2145 | } |
2146 | 2146 | ||
2147 | 2147 | ||
@@ -2151,11 +2151,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) | |||
2151 | struct ceph_osd_client *osdc = o->o_osdc; | 2151 | struct ceph_osd_client *osdc = o->o_osdc; |
2152 | struct ceph_auth_client *ac = osdc->client->monc.auth; | 2152 | struct ceph_auth_client *ac = osdc->client->monc.auth; |
2153 | 2153 | ||
2154 | /* | 2154 | return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); |
2155 | * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, | ||
2156 | * XXX which do we do: succeed or fail? | ||
2157 | */ | ||
2158 | return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); | ||
2159 | } | 2155 | } |
2160 | 2156 | ||
2161 | static int invalidate_authorizer(struct ceph_connection *con) | 2157 | static int invalidate_authorizer(struct ceph_connection *con) |
@@ -2164,7 +2160,7 @@ static int invalidate_authorizer(struct ceph_connection *con) | |||
2164 | struct ceph_osd_client *osdc = o->o_osdc; | 2160 | struct ceph_osd_client *osdc = o->o_osdc; |
2165 | struct ceph_auth_client *ac = osdc->client->monc.auth; | 2161 | struct ceph_auth_client *ac = osdc->client->monc.auth; |
2166 | 2162 | ||
2167 | if (ac->ops && ac->ops->invalidate_authorizer) | 2163 | if (ac->ops->invalidate_authorizer) |
2168 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); | 2164 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); |
2169 | 2165 | ||
2170 | return ceph_monc_validate_auth(&osdc->client->monc); | 2166 | return ceph_monc_validate_auth(&osdc->client->monc); |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index de73214b5d2..fd863fe7693 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -38,7 +38,7 @@ done: | |||
38 | 38 | ||
39 | /* maps */ | 39 | /* maps */ |
40 | 40 | ||
41 | static int calc_bits_of(unsigned int t) | 41 | static int calc_bits_of(unsigned t) |
42 | { | 42 | { |
43 | int b = 0; | 43 | int b = 0; |
44 | while (t) { | 44 | while (t) { |
@@ -135,21 +135,6 @@ bad: | |||
135 | return -EINVAL; | 135 | return -EINVAL; |
136 | } | 136 | } |
137 | 137 | ||
138 | static int skip_name_map(void **p, void *end) | ||
139 | { | ||
140 | int len; | ||
141 | ceph_decode_32_safe(p, end, len ,bad); | ||
142 | while (len--) { | ||
143 | int strlen; | ||
144 | *p += sizeof(u32); | ||
145 | ceph_decode_32_safe(p, end, strlen, bad); | ||
146 | *p += strlen; | ||
147 | } | ||
148 | return 0; | ||
149 | bad: | ||
150 | return -EINVAL; | ||
151 | } | ||
152 | |||
153 | static struct crush_map *crush_decode(void *pbyval, void *end) | 138 | static struct crush_map *crush_decode(void *pbyval, void *end) |
154 | { | 139 | { |
155 | struct crush_map *c; | 140 | struct crush_map *c; |
@@ -158,7 +143,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
158 | void **p = &pbyval; | 143 | void **p = &pbyval; |
159 | void *start = pbyval; | 144 | void *start = pbyval; |
160 | u32 magic; | 145 | u32 magic; |
161 | u32 num_name_maps; | ||
162 | 146 | ||
163 | dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); | 147 | dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); |
164 | 148 | ||
@@ -166,22 +150,24 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
166 | if (c == NULL) | 150 | if (c == NULL) |
167 | return ERR_PTR(-ENOMEM); | 151 | return ERR_PTR(-ENOMEM); |
168 | 152 | ||
169 | /* set tunables to default values */ | ||
170 | c->choose_local_tries = 2; | ||
171 | c->choose_local_fallback_tries = 5; | ||
172 | c->choose_total_tries = 19; | ||
173 | |||
174 | ceph_decode_need(p, end, 4*sizeof(u32), bad); | 153 | ceph_decode_need(p, end, 4*sizeof(u32), bad); |
175 | magic = ceph_decode_32(p); | 154 | magic = ceph_decode_32(p); |
176 | if (magic != CRUSH_MAGIC) { | 155 | if (magic != CRUSH_MAGIC) { |
177 | pr_err("crush_decode magic %x != current %x\n", | 156 | pr_err("crush_decode magic %x != current %x\n", |
178 | (unsigned int)magic, (unsigned int)CRUSH_MAGIC); | 157 | (unsigned)magic, (unsigned)CRUSH_MAGIC); |
179 | goto bad; | 158 | goto bad; |
180 | } | 159 | } |
181 | c->max_buckets = ceph_decode_32(p); | 160 | c->max_buckets = ceph_decode_32(p); |
182 | c->max_rules = ceph_decode_32(p); | 161 | c->max_rules = ceph_decode_32(p); |
183 | c->max_devices = ceph_decode_32(p); | 162 | c->max_devices = ceph_decode_32(p); |
184 | 163 | ||
164 | c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); | ||
165 | if (c->device_parents == NULL) | ||
166 | goto badmem; | ||
167 | c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); | ||
168 | if (c->bucket_parents == NULL) | ||
169 | goto badmem; | ||
170 | |||
185 | c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); | 171 | c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); |
186 | if (c->buckets == NULL) | 172 | if (c->buckets == NULL) |
187 | goto badmem; | 173 | goto badmem; |
@@ -297,8 +283,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
297 | ceph_decode_32_safe(p, end, yes, bad); | 283 | ceph_decode_32_safe(p, end, yes, bad); |
298 | #if BITS_PER_LONG == 32 | 284 | #if BITS_PER_LONG == 32 |
299 | err = -EINVAL; | 285 | err = -EINVAL; |
300 | if (yes > (ULONG_MAX - sizeof(*r)) | 286 | if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) |
301 | / sizeof(struct crush_rule_step)) | ||
302 | goto bad; | 287 | goto bad; |
303 | #endif | 288 | #endif |
304 | r = c->rules[i] = kmalloc(sizeof(*r) + | 289 | r = c->rules[i] = kmalloc(sizeof(*r) + |
@@ -318,25 +303,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
318 | } | 303 | } |
319 | 304 | ||
320 | /* ignore trailing name maps. */ | 305 | /* ignore trailing name maps. */ |
321 | for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { | ||
322 | err = skip_name_map(p, end); | ||
323 | if (err < 0) | ||
324 | goto done; | ||
325 | } | ||
326 | |||
327 | /* tunables */ | ||
328 | ceph_decode_need(p, end, 3*sizeof(u32), done); | ||
329 | c->choose_local_tries = ceph_decode_32(p); | ||
330 | c->choose_local_fallback_tries = ceph_decode_32(p); | ||
331 | c->choose_total_tries = ceph_decode_32(p); | ||
332 | dout("crush decode tunable choose_local_tries = %d", | ||
333 | c->choose_local_tries); | ||
334 | dout("crush decode tunable choose_local_fallback_tries = %d", | ||
335 | c->choose_local_fallback_tries); | ||
336 | dout("crush decode tunable choose_total_tries = %d", | ||
337 | c->choose_total_tries); | ||
338 | 306 | ||
339 | done: | ||
340 | dout("crush_decode success\n"); | 307 | dout("crush_decode success\n"); |
341 | return c; | 308 | return c; |
342 | 309 | ||
@@ -469,22 +436,6 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) | |||
469 | return NULL; | 436 | return NULL; |
470 | } | 437 | } |
471 | 438 | ||
472 | const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) | ||
473 | { | ||
474 | struct ceph_pg_pool_info *pi; | ||
475 | |||
476 | if (id == CEPH_NOPOOL) | ||
477 | return NULL; | ||
478 | |||
479 | if (WARN_ON_ONCE(id > (u64) INT_MAX)) | ||
480 | return NULL; | ||
481 | |||
482 | pi = __lookup_pg_pool(&map->pg_pools, (int) id); | ||
483 | |||
484 | return pi ? pi->name : NULL; | ||
485 | } | ||
486 | EXPORT_SYMBOL(ceph_pg_pool_name_by_id); | ||
487 | |||
488 | int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) | 439 | int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) |
489 | { | 440 | { |
490 | struct rb_node *rbp; | 441 | struct rb_node *rbp; |
@@ -508,7 +459,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) | |||
508 | 459 | ||
509 | static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) | 460 | static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) |
510 | { | 461 | { |
511 | unsigned int n, m; | 462 | unsigned n, m; |
512 | 463 | ||
513 | ceph_decode_copy(p, &pi->v, sizeof(pi->v)); | 464 | ceph_decode_copy(p, &pi->v, sizeof(pi->v)); |
514 | calc_pg_masks(pi); | 465 | calc_pg_masks(pi); |
@@ -543,16 +494,15 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) | |||
543 | ceph_decode_32_safe(p, end, pool, bad); | 494 | ceph_decode_32_safe(p, end, pool, bad); |
544 | ceph_decode_32_safe(p, end, len, bad); | 495 | ceph_decode_32_safe(p, end, len, bad); |
545 | dout(" pool %d len %d\n", pool, len); | 496 | dout(" pool %d len %d\n", pool, len); |
546 | ceph_decode_need(p, end, len, bad); | ||
547 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 497 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
548 | if (pi) { | 498 | if (pi) { |
549 | char *name = kstrndup(*p, len, GFP_NOFS); | ||
550 | |||
551 | if (!name) | ||
552 | return -ENOMEM; | ||
553 | kfree(pi->name); | 499 | kfree(pi->name); |
554 | pi->name = name; | 500 | pi->name = kmalloc(len + 1, GFP_NOFS); |
555 | dout(" name is %s\n", pi->name); | 501 | if (pi->name) { |
502 | memcpy(pi->name, *p, len); | ||
503 | pi->name[len] = '\0'; | ||
504 | dout(" name is %s\n", pi->name); | ||
505 | } | ||
556 | } | 506 | } |
557 | *p += len; | 507 | *p += len; |
558 | } | 508 | } |
@@ -661,12 +611,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
661 | ceph_decode_32_safe(p, end, max, bad); | 611 | ceph_decode_32_safe(p, end, max, bad); |
662 | while (max--) { | 612 | while (max--) { |
663 | ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); | 613 | ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); |
664 | err = -ENOMEM; | ||
665 | pi = kzalloc(sizeof(*pi), GFP_NOFS); | 614 | pi = kzalloc(sizeof(*pi), GFP_NOFS); |
666 | if (!pi) | 615 | if (!pi) |
667 | goto bad; | 616 | goto bad; |
668 | pi->id = ceph_decode_32(p); | 617 | pi->id = ceph_decode_32(p); |
669 | err = -EINVAL; | ||
670 | ev = ceph_decode_8(p); /* encoding version */ | 618 | ev = ceph_decode_8(p); /* encoding version */ |
671 | if (ev > CEPH_PG_POOL_VERSION) { | 619 | if (ev > CEPH_PG_POOL_VERSION) { |
672 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | 620 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", |
@@ -682,13 +630,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
682 | __insert_pg_pool(&map->pg_pools, pi); | 630 | __insert_pg_pool(&map->pg_pools, pi); |
683 | } | 631 | } |
684 | 632 | ||
685 | if (version >= 5) { | 633 | if (version >= 5 && __decode_pool_names(p, end, map) < 0) |
686 | err = __decode_pool_names(p, end, map); | 634 | goto bad; |
687 | if (err < 0) { | ||
688 | dout("fail to decode pool names"); | ||
689 | goto bad; | ||
690 | } | ||
691 | } | ||
692 | 635 | ||
693 | ceph_decode_32_safe(p, end, map->pool_max, bad); | 636 | ceph_decode_32_safe(p, end, map->pool_max, bad); |
694 | 637 | ||
@@ -729,9 +672,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
729 | ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); | 672 | ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); |
730 | ceph_decode_copy(p, &pgid, sizeof(pgid)); | 673 | ceph_decode_copy(p, &pgid, sizeof(pgid)); |
731 | n = ceph_decode_32(p); | 674 | n = ceph_decode_32(p); |
732 | err = -EINVAL; | ||
733 | if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) | ||
734 | goto bad; | ||
735 | ceph_decode_need(p, end, n * sizeof(u32), bad); | 675 | ceph_decode_need(p, end, n * sizeof(u32), bad); |
736 | err = -ENOMEM; | 676 | err = -ENOMEM; |
737 | pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); | 677 | pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); |
@@ -768,7 +708,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
768 | return map; | 708 | return map; |
769 | 709 | ||
770 | bad: | 710 | bad: |
771 | dout("osdmap_decode fail err %d\n", err); | 711 | dout("osdmap_decode fail\n"); |
772 | ceph_osdmap_destroy(map); | 712 | ceph_osdmap_destroy(map); |
773 | return ERR_PTR(err); | 713 | return ERR_PTR(err); |
774 | } | 714 | } |
@@ -862,7 +802,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
862 | if (ev > CEPH_PG_POOL_VERSION) { | 802 | if (ev > CEPH_PG_POOL_VERSION) { |
863 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | 803 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", |
864 | ev, CEPH_PG_POOL_VERSION); | 804 | ev, CEPH_PG_POOL_VERSION); |
865 | err = -EINVAL; | ||
866 | goto bad; | 805 | goto bad; |
867 | } | 806 | } |
868 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 807 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
@@ -879,11 +818,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
879 | if (err < 0) | 818 | if (err < 0) |
880 | goto bad; | 819 | goto bad; |
881 | } | 820 | } |
882 | if (version >= 5) { | 821 | if (version >= 5 && __decode_pool_names(p, end, map) < 0) |
883 | err = __decode_pool_names(p, end, map); | 822 | goto bad; |
884 | if (err < 0) | ||
885 | goto bad; | ||
886 | } | ||
887 | 823 | ||
888 | /* old_pool */ | 824 | /* old_pool */ |
889 | ceph_decode_32_safe(p, end, len, bad); | 825 | ceph_decode_32_safe(p, end, len, bad); |
@@ -953,19 +889,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
953 | pglen = ceph_decode_32(p); | 889 | pglen = ceph_decode_32(p); |
954 | 890 | ||
955 | if (pglen) { | 891 | if (pglen) { |
956 | ceph_decode_need(p, end, pglen*sizeof(u32), bad); | ||
957 | |||
958 | /* removing existing (if any) */ | ||
959 | (void) __remove_pg_mapping(&map->pg_temp, pgid); | ||
960 | |||
961 | /* insert */ | 892 | /* insert */ |
962 | err = -EINVAL; | 893 | ceph_decode_need(p, end, pglen*sizeof(u32), bad); |
963 | if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) | ||
964 | goto bad; | ||
965 | err = -ENOMEM; | ||
966 | pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); | 894 | pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); |
967 | if (!pg) | 895 | if (!pg) { |
896 | err = -ENOMEM; | ||
968 | goto bad; | 897 | goto bad; |
898 | } | ||
969 | pg->pgid = pgid; | 899 | pg->pgid = pgid; |
970 | pg->len = pglen; | 900 | pg->len = pglen; |
971 | for (j = 0; j < pglen; j++) | 901 | for (j = 0; j < pglen; j++) |
@@ -1009,7 +939,7 @@ bad: | |||
1009 | * for now, we write only a single su, until we can | 939 | * for now, we write only a single su, until we can |
1010 | * pass a stride back to the caller. | 940 | * pass a stride back to the caller. |
1011 | */ | 941 | */ |
1012 | int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | 942 | void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, |
1013 | u64 off, u64 *plen, | 943 | u64 off, u64 *plen, |
1014 | u64 *ono, | 944 | u64 *ono, |
1015 | u64 *oxoff, u64 *oxlen) | 945 | u64 *oxoff, u64 *oxlen) |
@@ -1023,17 +953,11 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | |||
1023 | 953 | ||
1024 | dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, | 954 | dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, |
1025 | osize, su); | 955 | osize, su); |
1026 | if (su == 0 || sc == 0) | ||
1027 | goto invalid; | ||
1028 | su_per_object = osize / su; | 956 | su_per_object = osize / su; |
1029 | if (su_per_object == 0) | ||
1030 | goto invalid; | ||
1031 | dout("osize %u / su %u = su_per_object %u\n", osize, su, | 957 | dout("osize %u / su %u = su_per_object %u\n", osize, su, |
1032 | su_per_object); | 958 | su_per_object); |
1033 | 959 | ||
1034 | if ((su & ~PAGE_MASK) != 0) | 960 | BUG_ON((su & ~PAGE_MASK) != 0); |
1035 | goto invalid; | ||
1036 | |||
1037 | /* bl = *off / su; */ | 961 | /* bl = *off / su; */ |
1038 | t = off; | 962 | t = off; |
1039 | do_div(t, su); | 963 | do_div(t, su); |
@@ -1045,7 +969,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | |||
1045 | objsetno = stripeno / su_per_object; | 969 | objsetno = stripeno / su_per_object; |
1046 | 970 | ||
1047 | *ono = objsetno * sc + stripepos; | 971 | *ono = objsetno * sc + stripepos; |
1048 | dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono); | 972 | dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); |
1049 | 973 | ||
1050 | /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ | 974 | /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ |
1051 | t = off; | 975 | t = off; |
@@ -1061,14 +985,6 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | |||
1061 | *plen = *oxlen; | 985 | *plen = *oxlen; |
1062 | 986 | ||
1063 | dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); | 987 | dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); |
1064 | return 0; | ||
1065 | |||
1066 | invalid: | ||
1067 | dout(" invalid layout\n"); | ||
1068 | *ono = 0; | ||
1069 | *oxoff = 0; | ||
1070 | *oxlen = 0; | ||
1071 | return -EINVAL; | ||
1072 | } | 988 | } |
1073 | EXPORT_SYMBOL(ceph_calc_file_object_mapping); | 989 | EXPORT_SYMBOL(ceph_calc_file_object_mapping); |
1074 | 990 | ||
@@ -1081,11 +997,12 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, | |||
1081 | struct ceph_file_layout *fl, | 997 | struct ceph_file_layout *fl, |
1082 | struct ceph_osdmap *osdmap) | 998 | struct ceph_osdmap *osdmap) |
1083 | { | 999 | { |
1084 | unsigned int num, num_mask; | 1000 | unsigned num, num_mask; |
1085 | struct ceph_pg pgid; | 1001 | struct ceph_pg pgid; |
1002 | s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); | ||
1086 | int poolid = le32_to_cpu(fl->fl_pg_pool); | 1003 | int poolid = le32_to_cpu(fl->fl_pg_pool); |
1087 | struct ceph_pg_pool_info *pool; | 1004 | struct ceph_pg_pool_info *pool; |
1088 | unsigned int ps; | 1005 | unsigned ps; |
1089 | 1006 | ||
1090 | BUG_ON(!osdmap); | 1007 | BUG_ON(!osdmap); |
1091 | 1008 | ||
@@ -1093,13 +1010,23 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, | |||
1093 | if (!pool) | 1010 | if (!pool) |
1094 | return -EIO; | 1011 | return -EIO; |
1095 | ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); | 1012 | ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); |
1096 | num = le32_to_cpu(pool->v.pg_num); | 1013 | if (preferred >= 0) { |
1097 | num_mask = pool->pg_num_mask; | 1014 | ps += preferred; |
1015 | num = le32_to_cpu(pool->v.lpg_num); | ||
1016 | num_mask = pool->lpg_num_mask; | ||
1017 | } else { | ||
1018 | num = le32_to_cpu(pool->v.pg_num); | ||
1019 | num_mask = pool->pg_num_mask; | ||
1020 | } | ||
1098 | 1021 | ||
1099 | pgid.ps = cpu_to_le16(ps); | 1022 | pgid.ps = cpu_to_le16(ps); |
1100 | pgid.preferred = cpu_to_le16(-1); | 1023 | pgid.preferred = cpu_to_le16(preferred); |
1101 | pgid.pool = fl->fl_pg_pool; | 1024 | pgid.pool = fl->fl_pg_pool; |
1102 | dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); | 1025 | if (preferred >= 0) |
1026 | dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, | ||
1027 | (int)preferred); | ||
1028 | else | ||
1029 | dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); | ||
1103 | 1030 | ||
1104 | ol->ol_pgid = pgid; | 1031 | ol->ol_pgid = pgid; |
1105 | ol->ol_stripe_unit = fl->fl_object_stripe_unit; | 1032 | ol->ol_stripe_unit = fl->fl_object_stripe_unit; |
@@ -1117,18 +1044,24 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
1117 | struct ceph_pg_mapping *pg; | 1044 | struct ceph_pg_mapping *pg; |
1118 | struct ceph_pg_pool_info *pool; | 1045 | struct ceph_pg_pool_info *pool; |
1119 | int ruleno; | 1046 | int ruleno; |
1120 | unsigned int poolid, ps, pps, t, r; | 1047 | unsigned poolid, ps, pps, t; |
1048 | int preferred; | ||
1121 | 1049 | ||
1122 | poolid = le32_to_cpu(pgid.pool); | 1050 | poolid = le32_to_cpu(pgid.pool); |
1123 | ps = le16_to_cpu(pgid.ps); | 1051 | ps = le16_to_cpu(pgid.ps); |
1052 | preferred = (s16)le16_to_cpu(pgid.preferred); | ||
1124 | 1053 | ||
1125 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); | 1054 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); |
1126 | if (!pool) | 1055 | if (!pool) |
1127 | return NULL; | 1056 | return NULL; |
1128 | 1057 | ||
1129 | /* pg_temp? */ | 1058 | /* pg_temp? */ |
1130 | t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), | 1059 | if (preferred >= 0) |
1131 | pool->pgp_num_mask); | 1060 | t = ceph_stable_mod(ps, le32_to_cpu(pool->v.lpg_num), |
1061 | pool->lpgp_num_mask); | ||
1062 | else | ||
1063 | t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), | ||
1064 | pool->pgp_num_mask); | ||
1132 | pgid.ps = cpu_to_le16(t); | 1065 | pgid.ps = cpu_to_le16(t); |
1133 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); | 1066 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); |
1134 | if (pg) { | 1067 | if (pg) { |
@@ -1146,20 +1079,23 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
1146 | return NULL; | 1079 | return NULL; |
1147 | } | 1080 | } |
1148 | 1081 | ||
1149 | pps = ceph_stable_mod(ps, | 1082 | /* don't forcefeed bad device ids to crush */ |
1150 | le32_to_cpu(pool->v.pgp_num), | 1083 | if (preferred >= osdmap->max_osd || |
1151 | pool->pgp_num_mask); | 1084 | preferred >= osdmap->crush->max_devices) |
1085 | preferred = -1; | ||
1086 | |||
1087 | if (preferred >= 0) | ||
1088 | pps = ceph_stable_mod(ps, | ||
1089 | le32_to_cpu(pool->v.lpgp_num), | ||
1090 | pool->lpgp_num_mask); | ||
1091 | else | ||
1092 | pps = ceph_stable_mod(ps, | ||
1093 | le32_to_cpu(pool->v.pgp_num), | ||
1094 | pool->pgp_num_mask); | ||
1152 | pps += poolid; | 1095 | pps += poolid; |
1153 | r = crush_do_rule(osdmap->crush, ruleno, pps, osds, | 1096 | *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, |
1154 | min_t(int, pool->v.size, *num), | 1097 | min_t(int, pool->v.size, *num), |
1155 | osdmap->osd_weight); | 1098 | preferred, osdmap->osd_weight); |
1156 | if (r < 0) { | ||
1157 | pr_err("error %d from crush rule: pool %d ruleset %d type %d" | ||
1158 | " size %d\n", r, poolid, pool->v.crush_ruleset, | ||
1159 | pool->v.type, pool->v.size); | ||
1160 | return NULL; | ||
1161 | } | ||
1162 | *num = r; | ||
1163 | return osds; | 1099 | return osds; |
1164 | } | 1100 | } |
1165 | 1101 | ||
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 92866bebb65..13cb409a7bb 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c | |||
@@ -1,3 +1,4 @@ | |||
1 | |||
1 | #include <linux/module.h> | 2 | #include <linux/module.h> |
2 | #include <linux/gfp.h> | 3 | #include <linux/gfp.h> |
3 | #include <linux/pagemap.h> | 4 | #include <linux/pagemap.h> |
@@ -71,7 +72,8 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) | |||
71 | } | 72 | } |
72 | EXPORT_SYMBOL(ceph_pagelist_append); | 73 | EXPORT_SYMBOL(ceph_pagelist_append); |
73 | 74 | ||
74 | /* Allocate enough pages for a pagelist to append the given amount | 75 | /** |
76 | * Allocate enough pages for a pagelist to append the given amount | ||
75 | * of data without without allocating. | 77 | * of data without without allocating. |
76 | * Returns: 0 on success, -ENOMEM on error. | 78 | * Returns: 0 on success, -ENOMEM on error. |
77 | */ | 79 | */ |
@@ -93,7 +95,9 @@ int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space) | |||
93 | } | 95 | } |
94 | EXPORT_SYMBOL(ceph_pagelist_reserve); | 96 | EXPORT_SYMBOL(ceph_pagelist_reserve); |
95 | 97 | ||
96 | /* Free any pages that have been preallocated. */ | 98 | /** |
99 | * Free any pages that have been preallocated. | ||
100 | */ | ||
97 | int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) | 101 | int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) |
98 | { | 102 | { |
99 | while (!list_empty(&pl->free_list)) { | 103 | while (!list_empty(&pl->free_list)) { |
@@ -108,7 +112,9 @@ int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) | |||
108 | } | 112 | } |
109 | EXPORT_SYMBOL(ceph_pagelist_free_reserve); | 113 | EXPORT_SYMBOL(ceph_pagelist_free_reserve); |
110 | 114 | ||
111 | /* Create a truncation point. */ | 115 | /** |
116 | * Create a truncation point. | ||
117 | */ | ||
112 | void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, | 118 | void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, |
113 | struct ceph_pagelist_cursor *c) | 119 | struct ceph_pagelist_cursor *c) |
114 | { | 120 | { |
@@ -118,7 +124,8 @@ void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, | |||
118 | } | 124 | } |
119 | EXPORT_SYMBOL(ceph_pagelist_set_cursor); | 125 | EXPORT_SYMBOL(ceph_pagelist_set_cursor); |
120 | 126 | ||
121 | /* Truncate a pagelist to the given point. Move extra pages to reserve. | 127 | /** |
128 | * Truncate a pagelist to the given point. Move extra pages to reserve. | ||
122 | * This won't sleep. | 129 | * This won't sleep. |
123 | * Returns: 0 on success, | 130 | * Returns: 0 on success, |
124 | * -EINVAL if the pagelist doesn't match the trunc point pagelist | 131 | * -EINVAL if the pagelist doesn't match the trunc point pagelist |
@@ -133,8 +140,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl, | |||
133 | ceph_pagelist_unmap_tail(pl); | 140 | ceph_pagelist_unmap_tail(pl); |
134 | while (pl->head.prev != c->page_lru) { | 141 | while (pl->head.prev != c->page_lru) { |
135 | page = list_entry(pl->head.prev, struct page, lru); | 142 | page = list_entry(pl->head.prev, struct page, lru); |
136 | /* move from pagelist to reserve */ | 143 | list_del(&page->lru); /* remove from pagelist */ |
137 | list_move_tail(&page->lru, &pl->free_list); | 144 | list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ |
138 | ++pl->num_pages_free; | 145 | ++pl->num_pages_free; |
139 | } | 146 | } |
140 | pl->room = c->room; | 147 | pl->room = c->room; |