diff options
author | Sowmini Varadhan <sowmini.varadhan@oracle.com> | 2017-11-30 14:11:28 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-12-01 15:25:15 -0500 |
commit | 681648e67d43cf269c5590ecf021ed481f4551fc (patch) | |
tree | 834b2c6ccc03c66ce9c4cd98c30a2efc7578f5c0 | |
parent | 2d746c93b6e55d34a98c8983b30d991707a2059b (diff) |
rds: tcp: correctly sequence cleanup on netns deletion.
Commit 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net")
introduces a regression in rds-tcp netns cleanup. The cleanup_net(),
(and thus rds_tcp_dev_event notification) is only called from put_net()
when all netns refcounts go to 0, but this cannot happen if the
rds_connection itself is holding a c_net ref that it expects to
release in rds_tcp_kill_sock.
Instead, the rds_tcp_kill_sock callback should make sure to
tear down state carefully, ensuring that the socket teardown
is only done after all data-structures and workqs that depend
on it are quiesced.
The original motivation for commit 8edc3affc077 ("rds: tcp: Take explicit
refcounts on struct net") was to resolve a race condition reported by
syzkaller where workqs for tx/rx/connect were triggered after the
namespace was deleted. Those worker threads should have been
cancelled/flushed before socket tear-down and indeed,
rds_conn_path_destroy() does try to sequence this by doing
/* cancel cp_send_w */
/* cancel cp_recv_w */
/* flush cp_down_w */
/* free data structures */
Here the "flush cp_down_w" will trigger rds_conn_shutdown and thus
invoke rds_tcp_conn_path_shutdown() to close the tcp socket, so that
we ought to have satisfied the requirement that "socket-close is
done after all other dependent state is quiesced". However,
rds_conn_shutdown has a bug in that it *always* triggers the reconnect
workq (and if connection is successful, we always restart tx/rx
workqs so with the right timing, we risk the race conditions reported
by syzkaller).
Netns deletion is like module teardown- no need to restart a
reconnect in this case. We can use the c_destroy_in_prog bit
to avoid restarting the reconnect.
Fixes: 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net")
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/rds/connection.c | 3 | ||||
-rw-r--r-- | net/rds/rds.h | 6 | ||||
-rw-r--r-- | net/rds/tcp.c | 4 |
3 files changed, 7 insertions, 6 deletions
diff --git a/net/rds/connection.c b/net/rds/connection.c index 7ee2d5d68b78..9efc82c665b5 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c | |||
@@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp) | |||
366 | * to the conn hash, so we never trigger a reconnect on this | 366 | * to the conn hash, so we never trigger a reconnect on this |
367 | * conn - the reconnect is always triggered by the active peer. */ | 367 | * conn - the reconnect is always triggered by the active peer. */ |
368 | cancel_delayed_work_sync(&cp->cp_conn_w); | 368 | cancel_delayed_work_sync(&cp->cp_conn_w); |
369 | if (conn->c_destroy_in_prog) | ||
370 | return; | ||
369 | rcu_read_lock(); | 371 | rcu_read_lock(); |
370 | if (!hlist_unhashed(&conn->c_hash_node)) { | 372 | if (!hlist_unhashed(&conn->c_hash_node)) { |
371 | rcu_read_unlock(); | 373 | rcu_read_unlock(); |
@@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn) | |||
445 | */ | 447 | */ |
446 | rds_cong_remove_conn(conn); | 448 | rds_cong_remove_conn(conn); |
447 | 449 | ||
448 | put_net(conn->c_net); | ||
449 | kfree(conn->c_path); | 450 | kfree(conn->c_path); |
450 | kmem_cache_free(rds_conn_slab, conn); | 451 | kmem_cache_free(rds_conn_slab, conn); |
451 | 452 | ||
diff --git a/net/rds/rds.h b/net/rds/rds.h index c349c71babff..d09f6c1facb4 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h | |||
@@ -150,7 +150,7 @@ struct rds_connection { | |||
150 | 150 | ||
151 | /* Protocol version */ | 151 | /* Protocol version */ |
152 | unsigned int c_version; | 152 | unsigned int c_version; |
153 | struct net *c_net; | 153 | possible_net_t c_net; |
154 | 154 | ||
155 | struct list_head c_map_item; | 155 | struct list_head c_map_item; |
156 | unsigned long c_map_queued; | 156 | unsigned long c_map_queued; |
@@ -165,13 +165,13 @@ struct rds_connection { | |||
165 | static inline | 165 | static inline |
166 | struct net *rds_conn_net(struct rds_connection *conn) | 166 | struct net *rds_conn_net(struct rds_connection *conn) |
167 | { | 167 | { |
168 | return conn->c_net; | 168 | return read_pnet(&conn->c_net); |
169 | } | 169 | } |
170 | 170 | ||
171 | static inline | 171 | static inline |
172 | void rds_conn_net_set(struct rds_connection *conn, struct net *net) | 172 | void rds_conn_net_set(struct rds_connection *conn, struct net *net) |
173 | { | 173 | { |
174 | conn->c_net = get_net(net); | 174 | write_pnet(&conn->c_net, net); |
175 | } | 175 | } |
176 | 176 | ||
177 | #define RDS_FLAG_CONG_BITMAP 0x01 | 177 | #define RDS_FLAG_CONG_BITMAP 0x01 |
diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 222cc530e5b5..f580f72ae69e 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c | |||
@@ -506,7 +506,7 @@ static void rds_tcp_kill_sock(struct net *net) | |||
506 | rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); | 506 | rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); |
507 | spin_lock_irq(&rds_tcp_conn_lock); | 507 | spin_lock_irq(&rds_tcp_conn_lock); |
508 | list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { | 508 | list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { |
509 | struct net *c_net = tc->t_cpath->cp_conn->c_net; | 509 | struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); |
510 | 510 | ||
511 | if (net != c_net || !tc->t_sock) | 511 | if (net != c_net || !tc->t_sock) |
512 | continue; | 512 | continue; |
@@ -563,7 +563,7 @@ static void rds_tcp_sysctl_reset(struct net *net) | |||
563 | 563 | ||
564 | spin_lock_irq(&rds_tcp_conn_lock); | 564 | spin_lock_irq(&rds_tcp_conn_lock); |
565 | list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { | 565 | list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { |
566 | struct net *c_net = tc->t_cpath->cp_conn->c_net; | 566 | struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); |
567 | 567 | ||
568 | if (net != c_net || !tc->t_sock) | 568 | if (net != c_net || !tc->t_sock) |
569 | continue; | 569 | continue; |