summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorSowmini Varadhan <sowmini.varadhan@oracle.com>2015-05-05 15:20:51 -0400
committerDavid S. Miller <davem@davemloft.net>2015-05-09 16:03:27 -0400
commitf711a6ae062caeee46067b2f2f12ffda319ae73c (patch)
tree518c5041d8e081998cae22c4b43a32338f148bf7 /net
parente16e888b525503be05b3aea64190e8b3bdef44d0 (diff)
net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection.
When running RDS over TCP, the active (client) side connects to the listening ("passive") side at the RDS_TCP_PORT. After the connection is established, if the client side reboots (potentially without even sending a FIN) the server still has a TCP socket in the established state. If the server-side now gets a new SYN from the client with a different client port, TCP will create a new socket-pair, but the RDS layer will incorrectly pull up the old rds_connection (which is still associated with the stale t_sock and RDS socket state). This patch corrects this behavior by having rds_tcp_accept_one() always create a new connection for an incoming TCP SYN. The rds and tcp state associated with the old socket-pair is cleaned up via the rds_tcp_state_change() callback which would typically be invoked in most cases when the client-TCP sends a FIN on TCP restart, triggering a transition to CLOSE_WAIT state. In the rarer event of client death without a FIN, TCP_KEEPALIVE probes on the socket will detect the stale socket, and the TCP transition to CLOSE state will trigger the RDS state cleanup. Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/rds/connection.c4
-rw-r--r--net/rds/tcp_connect.c1
-rw-r--r--net/rds/tcp_listen.c46
3 files changed, 51 insertions, 0 deletions
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 14f041398ca1..60f0cd6ed15f 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
126 struct rds_transport *loop_trans; 126 struct rds_transport *loop_trans;
127 unsigned long flags; 127 unsigned long flags;
128 int ret; 128 int ret;
129 struct rds_transport *otrans = trans;
129 130
131 if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
132 goto new_conn;
130 rcu_read_lock(); 133 rcu_read_lock();
131 conn = rds_conn_lookup(head, laddr, faddr, trans); 134 conn = rds_conn_lookup(head, laddr, faddr, trans);
132 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && 135 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
@@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
142 if (conn) 145 if (conn)
143 goto out; 146 goto out;
144 147
148new_conn:
145 conn = kmem_cache_zalloc(rds_conn_slab, gfp); 149 conn = kmem_cache_zalloc(rds_conn_slab, gfp);
146 if (!conn) { 150 if (!conn) {
147 conn = ERR_PTR(-ENOMEM); 151 conn = ERR_PTR(-ENOMEM);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index f9f564a6c960..973109c7b8e8 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
62 case TCP_ESTABLISHED: 62 case TCP_ESTABLISHED:
63 rds_connect_complete(conn); 63 rds_connect_complete(conn);
64 break; 64 break;
65 case TCP_CLOSE_WAIT:
65 case TCP_CLOSE: 66 case TCP_CLOSE:
66 rds_conn_drop(conn); 67 rds_conn_drop(conn);
67 default: 68 default:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 23ab4dcd1d9f..0da49e34495f 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
45static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); 45static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
46static struct socket *rds_tcp_listen_sock; 46static struct socket *rds_tcp_listen_sock;
47 47
48static int rds_tcp_keepalive(struct socket *sock)
49{
50 /* values below based on xs_udp_default_timeout */
51 int keepidle = 5; /* send a probe 'keepidle' secs after last data */
52 int keepcnt = 5; /* number of unack'ed probes before declaring dead */
53 int keepalive = 1;
54 int ret = 0;
55
56 ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
57 (char *)&keepalive, sizeof(keepalive));
58 if (ret < 0)
59 goto bail;
60
61 ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
62 (char *)&keepcnt, sizeof(keepcnt));
63 if (ret < 0)
64 goto bail;
65
66 ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
67 (char *)&keepidle, sizeof(keepidle));
68 if (ret < 0)
69 goto bail;
70
71 /* KEEPINTVL is the interval between successive probes. We follow
72 * the model in xs_tcp_finish_connecting() and re-use keepidle.
73 */
74 ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
75 (char *)&keepidle, sizeof(keepidle));
76bail:
77 return ret;
78}
79
48static int rds_tcp_accept_one(struct socket *sock) 80static int rds_tcp_accept_one(struct socket *sock)
49{ 81{
50 struct socket *new_sock = NULL; 82 struct socket *new_sock = NULL;
51 struct rds_connection *conn; 83 struct rds_connection *conn;
52 int ret; 84 int ret;
53 struct inet_sock *inet; 85 struct inet_sock *inet;
86 struct rds_tcp_connection *rs_tcp;
54 87
55 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, 88 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
56 sock->sk->sk_protocol, &new_sock); 89 sock->sk->sk_protocol, &new_sock);
@@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
63 if (ret < 0) 96 if (ret < 0)
64 goto out; 97 goto out;
65 98
99 ret = rds_tcp_keepalive(new_sock);
100 if (ret < 0)
101 goto out;
102
66 rds_tcp_tune(new_sock); 103 rds_tcp_tune(new_sock);
67 104
68 inet = inet_sk(new_sock->sk); 105 inet = inet_sk(new_sock->sk);
@@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
77 ret = PTR_ERR(conn); 114 ret = PTR_ERR(conn);
78 goto out; 115 goto out;
79 } 116 }
117 /* An incoming SYN request came in, and TCP just accepted it.
118 * We always create a new conn for listen side of TCP, and do not
119 * add it to the c_hash_list.
120 *
121 * If the client reboots, this conn will need to be cleaned up.
122 * rds_tcp_state_change() will do that cleanup
123 */
124 rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
125 WARN_ON(!rs_tcp || rs_tcp->t_sock);
80 126
81 /* 127 /*
82 * see the comment above rds_queue_delayed_reconnect() 128 * see the comment above rds_queue_delayed_reconnect()