aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSowmini Varadhan <sowmini.varadhan@oracle.com>2016-11-16 16:29:50 -0500
committerDavid S. Miller <davem@davemloft.net>2016-11-17 13:35:18 -0500
commit1a0e100fb2c9667cea2a7d755faaa83569942f05 (patch)
tree234b4827c3c99e6d911bd4b7ea787a1185e695df
parent905dd4184e0732de41d6ee3c7b06e0cfdd9f0aad (diff)
RDS: TCP: Force every connection to be initiated by numerically smaller IP address
When 2 RDS peers initiate an RDS-TCP connection simultaneously, there is a potential for "duelling syns" on either/both sides. See commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an outgoing socket in rds_tcp_accept_one()") for a description of this condition, and the arbitration logic which ensures that the numerically larger IP address in the TCP connection is bound to the RDS_TCP_PORT ("canonical ordering"). The rds_connection should not be marked as RDS_CONN_UP until the arbitration logic has converged for the following reason. The sender may start transmitting RDS datagrams as soon as RDS_CONN_UP is set, and the sender removes all datagrams from the rds_connection's cp_retrans queue based on TCP acks. If the TCP ack was sent from a tcp socket that got reset as part of duel arbitration (but before data was delivered to the receiver's RDS socket layer), the sender may end up prematurely freeing the datagram, and the datagram is no longer reliably deliverable. This patch remedies that condition by making sure that, upon receipt of 3WH completion state change notification of TCP_ESTABLISHED in rds_tcp_state_change, we mark the rds_connection as RDS_CONN_UP if, and only if, the IP addresses and ports for the connection are canonically ordered. In all other cases, rds_tcp_state_change will force an rds_conn_path_drop(), and rds_queue_reconnect() on both peers will restart the connection to ensure canonical ordering. A side-effect of enforcing this condition in rds_tcp_state_change() is that rds_tcp_accept_one_path() can now be refactored for simplicity. It is also no longer possible to encounter an RDS_CONN_UP connection in the arbitration logic in rds_tcp_accept_one(). Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com> Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/rds/connection.c1
-rw-r--r--net/rds/tcp_connect.c14
-rw-r--r--net/rds/tcp_listen.c29
3 files changed, 26 insertions, 18 deletions
diff --git a/net/rds/connection.c b/net/rds/connection.c
index b86e188bde32..fe9d31c0b22d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -683,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
683 !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) 683 !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
684 queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); 684 queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
685} 685}
686EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
686 687
687void rds_conn_connect_if_down(struct rds_connection *conn) 688void rds_conn_connect_if_down(struct rds_connection *conn)
688{ 689{
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 05f61c533ed3..d6839d96d539 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
60 case TCP_SYN_RECV: 60 case TCP_SYN_RECV:
61 break; 61 break;
62 case TCP_ESTABLISHED: 62 case TCP_ESTABLISHED:
63 rds_connect_path_complete(cp, RDS_CONN_CONNECTING); 63 /* Force the peer to reconnect so that we have the
64 * TCP ports going from <smaller-ip>.<transient> to
65 * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
66 * RDS connection as RDS_CONN_UP until the reconnect,
67 * to avoid RDS datagram loss.
68 */
69 if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
70 rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
71 RDS_CONN_ERROR)) {
72 rds_conn_path_drop(cp);
73 } else {
74 rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
75 }
64 break; 76 break;
65 case TCP_CLOSE_WAIT: 77 case TCP_CLOSE_WAIT:
66 case TCP_CLOSE: 78 case TCP_CLOSE:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index c9c496844cd7..f74bab3ecdca 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
83{ 83{
84 int i; 84 int i;
85 bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); 85 bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
86 int npaths = conn->c_npaths; 86 int npaths = max_t(int, 1, conn->c_npaths);
87
88 if (npaths <= 1) {
89 struct rds_conn_path *cp = &conn->c_path[0];
90 int ret;
91
92 ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
93 RDS_CONN_CONNECTING);
94 if (!ret)
95 rds_conn_path_transition(cp, RDS_CONN_ERROR,
96 RDS_CONN_CONNECTING);
97 return cp->cp_transport_data;
98 }
99 87
100 /* for mprds, paths with cp_index > 0 MUST be initiated by the peer 88 /* for mprds, all paths MUST be initiated by the peer
101 * with the smaller address. 89 * with the smaller address.
102 */ 90 */
103 if (!peer_is_smaller) 91 if (!peer_is_smaller) {
92 /* Make sure we initiate at least one path if this
93 * has not already been done; rds_start_mprds() will
94 * take care of additional paths, if necessary.
95 */
96 if (npaths == 1)
97 rds_conn_path_connect_if_down(&conn->c_path[0]);
104 return NULL; 98 return NULL;
99 }
105 100
106 for (i = 0; i < npaths; i++) { 101 for (i = 0; i < npaths; i++) {
107 struct rds_conn_path *cp = &conn->c_path[i]; 102 struct rds_conn_path *cp = &conn->c_path[i];
@@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock)
171 mutex_lock(&rs_tcp->t_conn_path_lock); 166 mutex_lock(&rs_tcp->t_conn_path_lock);
172 cp = rs_tcp->t_cpath; 167 cp = rs_tcp->t_cpath;
173 conn_state = rds_conn_path_state(cp); 168 conn_state = rds_conn_path_state(cp);
174 if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && 169 WARN_ON(conn_state == RDS_CONN_UP);
175 conn_state != RDS_CONN_ERROR) 170 if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
176 goto rst_nsk; 171 goto rst_nsk;
177 if (rs_tcp->t_sock) { 172 if (rs_tcp->t_sock) {
178 /* Need to resolve a duelling SYN between peers. 173 /* Need to resolve a duelling SYN between peers.