aboutsummaryrefslogtreecommitdiffstats
path: root/net/ceph/messenger.c
diff options
context:
space:
mode:
author: Sage Weil <sage@inktank.com> 2012-07-10 14:53:34 -0400
committer: Sage Weil <sage@inktank.com> 2012-07-30 12:29:52 -0400
commit: a16cb1f70799c851410d9dca0a24122e258df06c (patch)
tree: 0290805d3594d770da3937da7fa68841c1e294ac /net/ceph/messenger.c
parent: cd43045c2de60f40a0aea49bfb252a2eafe58f8c (diff)
libceph: fix messenger retry
In ancient times, the messenger could both initiate and accept connections. An artifact of that was data structures to store/process an incoming ceph_msg_connect request and send an outgoing ceph_msg_connect_reply. Sadly, the negotiation code was referencing those structures and ignoring important information (like the peer's connect_seq) from the correct ones. Among other things, this fixes tight reconnect loops where the server sends RETRY_SESSION and we (the client) retry with the same connect_seq as last time. This bug is pretty easily triggered by injecting socket failures on the MDS and running some fs workload like workunits/direct_io/test_sync_io. Signed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'net/ceph/messenger.c')
-rw-r--r-- net/ceph/messenger.c 12
1 files changed, 6 insertions, 6 deletions
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 09ada7924874..16814d1f4774 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1540,7 +1540,7 @@ static int process_connect(struct ceph_connection *con)
1540 * dropped messages. 1540 * dropped messages.
1541 */ 1541 */
1542 dout("process_connect got RESET peer seq %u\n", 1542 dout("process_connect got RESET peer seq %u\n",
1543 le32_to_cpu(con->in_connect.connect_seq)); 1543 le32_to_cpu(con->in_reply.connect_seq));
1544 pr_err("%s%lld %s connection reset\n", 1544 pr_err("%s%lld %s connection reset\n",
1545 ENTITY_NAME(con->peer_name), 1545 ENTITY_NAME(con->peer_name),
1546 ceph_pr_addr(&con->peer_addr.in_addr)); 1546 ceph_pr_addr(&con->peer_addr.in_addr));
@@ -1566,10 +1566,10 @@ static int process_connect(struct ceph_connection *con)
1566 * If we sent a smaller connect_seq than the peer has, try 1566 * If we sent a smaller connect_seq than the peer has, try
1567 * again with a larger value. 1567 * again with a larger value.
1568 */ 1568 */
1569 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", 1569 dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
1570 le32_to_cpu(con->out_connect.connect_seq), 1570 le32_to_cpu(con->out_connect.connect_seq),
1571 le32_to_cpu(con->in_connect.connect_seq)); 1571 le32_to_cpu(con->in_reply.connect_seq));
1572 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); 1572 con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
1573 ret = prepare_write_connect(con); 1573 ret = prepare_write_connect(con);
1574 if (ret < 0) 1574 if (ret < 0)
1575 return ret; 1575 return ret;
@@ -1583,9 +1583,9 @@ static int process_connect(struct ceph_connection *con)
1583 */ 1583 */
1584 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", 1584 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1585 con->peer_global_seq, 1585 con->peer_global_seq,
1586 le32_to_cpu(con->in_connect.global_seq)); 1586 le32_to_cpu(con->in_reply.global_seq));
1587 get_global_seq(con->msgr, 1587 get_global_seq(con->msgr,
1588 le32_to_cpu(con->in_connect.global_seq)); 1588 le32_to_cpu(con->in_reply.global_seq));
1589 ret = prepare_write_connect(con); 1589 ret = prepare_write_connect(con);
1590 if (ret < 0) 1590 if (ret < 0)
1591 return ret; 1591 return ret;