diff options
author | Sowmini Varadhan <sowmini.varadhan@oracle.com> | 2015-05-05 15:20:51 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-05-09 16:03:27 -0400 |
commit | f711a6ae062caeee46067b2f2f12ffda319ae73c (patch) | |
tree | 518c5041d8e081998cae22c4b43a32338f148bf7 /net | |
parent | e16e888b525503be05b3aea64190e8b3bdef44d0 (diff) |
net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection.
When running RDS over TCP, the active (client) side connects to the
listening ("passive") side at the RDS_TCP_PORT. After the connection
is established, if the client side reboots (potentially without even
sending a FIN) the server still has a TCP socket in the established
state. If the server side now gets a new SYN from the client
with a different client port, TCP will create a new socket-pair, but
the RDS layer will incorrectly pull up the old rds_connection (which
is still associated with the stale t_sock and RDS socket state).
This patch corrects this behavior by having rds_tcp_accept_one()
always create a new connection for an incoming TCP SYN.
The rds and tcp state associated with the old socket-pair is cleaned
up via the rds_tcp_state_change() callback, which is typically
invoked when the client-TCP sends a FIN on TCP restart,
triggering a transition to CLOSE_WAIT state. In the rarer event of client
death without a FIN, TCP_KEEPALIVE probes on the socket will detect
the stale socket, and the TCP transition to CLOSE state will trigger
the RDS state cleanup.
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/rds/connection.c | 4 | ||||
-rw-r--r-- | net/rds/tcp_connect.c | 1 | ||||
-rw-r--r-- | net/rds/tcp_listen.c | 46 |
3 files changed, 51 insertions, 0 deletions
diff --git a/net/rds/connection.c b/net/rds/connection.c index 14f041398ca1..60f0cd6ed15f 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c | |||
@@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
126 | struct rds_transport *loop_trans; | 126 | struct rds_transport *loop_trans; |
127 | unsigned long flags; | 127 | unsigned long flags; |
128 | int ret; | 128 | int ret; |
129 | struct rds_transport *otrans = trans; | ||
129 | 130 | ||
131 | if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) | ||
132 | goto new_conn; | ||
130 | rcu_read_lock(); | 133 | rcu_read_lock(); |
131 | conn = rds_conn_lookup(head, laddr, faddr, trans); | 134 | conn = rds_conn_lookup(head, laddr, faddr, trans); |
132 | if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && | 135 | if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && |
@@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
142 | if (conn) | 145 | if (conn) |
143 | goto out; | 146 | goto out; |
144 | 147 | ||
148 | new_conn: | ||
145 | conn = kmem_cache_zalloc(rds_conn_slab, gfp); | 149 | conn = kmem_cache_zalloc(rds_conn_slab, gfp); |
146 | if (!conn) { | 150 | if (!conn) { |
147 | conn = ERR_PTR(-ENOMEM); | 151 | conn = ERR_PTR(-ENOMEM); |
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f9f564a6c960..973109c7b8e8 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c | |||
@@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk) | |||
62 | case TCP_ESTABLISHED: | 62 | case TCP_ESTABLISHED: |
63 | rds_connect_complete(conn); | 63 | rds_connect_complete(conn); |
64 | break; | 64 | break; |
65 | case TCP_CLOSE_WAIT: | ||
65 | case TCP_CLOSE: | 66 | case TCP_CLOSE: |
66 | rds_conn_drop(conn); | 67 | rds_conn_drop(conn); |
67 | default: | 68 | default: |
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 23ab4dcd1d9f..0da49e34495f 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c | |||
@@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work); | |||
45 | static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); | 45 | static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); |
46 | static struct socket *rds_tcp_listen_sock; | 46 | static struct socket *rds_tcp_listen_sock; |
47 | 47 | ||
48 | static int rds_tcp_keepalive(struct socket *sock) | ||
49 | { | ||
50 | /* values below based on xs_udp_default_timeout */ | ||
51 | int keepidle = 5; /* send a probe 'keepidle' secs after last data */ | ||
52 | int keepcnt = 5; /* number of unack'ed probes before declaring dead */ | ||
53 | int keepalive = 1; | ||
54 | int ret = 0; | ||
55 | |||
56 | ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, | ||
57 | (char *)&keepalive, sizeof(keepalive)); | ||
58 | if (ret < 0) | ||
59 | goto bail; | ||
60 | |||
61 | ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, | ||
62 | (char *)&keepcnt, sizeof(keepcnt)); | ||
63 | if (ret < 0) | ||
64 | goto bail; | ||
65 | |||
66 | ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, | ||
67 | (char *)&keepidle, sizeof(keepidle)); | ||
68 | if (ret < 0) | ||
69 | goto bail; | ||
70 | |||
71 | /* KEEPINTVL is the interval between successive probes. We follow | ||
72 | * the model in xs_tcp_finish_connecting() and re-use keepidle. | ||
73 | */ | ||
74 | ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, | ||
75 | (char *)&keepidle, sizeof(keepidle)); | ||
76 | bail: | ||
77 | return ret; | ||
78 | } | ||
79 | |||
48 | static int rds_tcp_accept_one(struct socket *sock) | 80 | static int rds_tcp_accept_one(struct socket *sock) |
49 | { | 81 | { |
50 | struct socket *new_sock = NULL; | 82 | struct socket *new_sock = NULL; |
51 | struct rds_connection *conn; | 83 | struct rds_connection *conn; |
52 | int ret; | 84 | int ret; |
53 | struct inet_sock *inet; | 85 | struct inet_sock *inet; |
86 | struct rds_tcp_connection *rs_tcp; | ||
54 | 87 | ||
55 | ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, | 88 | ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, |
56 | sock->sk->sk_protocol, &new_sock); | 89 | sock->sk->sk_protocol, &new_sock); |
@@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock) | |||
63 | if (ret < 0) | 96 | if (ret < 0) |
64 | goto out; | 97 | goto out; |
65 | 98 | ||
99 | ret = rds_tcp_keepalive(new_sock); | ||
100 | if (ret < 0) | ||
101 | goto out; | ||
102 | |||
66 | rds_tcp_tune(new_sock); | 103 | rds_tcp_tune(new_sock); |
67 | 104 | ||
68 | inet = inet_sk(new_sock->sk); | 105 | inet = inet_sk(new_sock->sk); |
@@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock) | |||
77 | ret = PTR_ERR(conn); | 114 | ret = PTR_ERR(conn); |
78 | goto out; | 115 | goto out; |
79 | } | 116 | } |
117 | /* An incoming SYN request came in, and TCP just accepted it. | ||
118 | * We always create a new conn for listen side of TCP, and do not | ||
119 | * add it to the c_hash_list. | ||
120 | * | ||
121 | * If the client reboots, this conn will need to be cleaned up. | ||
122 | * rds_tcp_state_change() will do that cleanup | ||
123 | */ | ||
124 | rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; | ||
125 | WARN_ON(!rs_tcp || rs_tcp->t_sock); | ||
80 | 126 | ||
81 | /* | 127 | /* |
82 | * see the comment above rds_queue_delayed_reconnect() | 128 | * see the comment above rds_queue_delayed_reconnect() |