summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSowmini Varadhan <sowmini.varadhan@oracle.com>2017-11-30 14:11:28 -0500
committerDavid S. Miller <davem@davemloft.net>2017-12-01 15:25:15 -0500
commit681648e67d43cf269c5590ecf021ed481f4551fc (patch)
tree834b2c6ccc03c66ce9c4cd98c30a2efc7578f5c0
parent2d746c93b6e55d34a98c8983b30d991707a2059b (diff)
rds: tcp: correctly sequence cleanup on netns deletion.
Commit 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net") introduces a regression in rds-tcp netns cleanup. The cleanup_net(), (and thus rds_tcp_dev_event notification) is only called from put_net() when all netns refcounts go to 0, but this cannot happen if the rds_connection itself is holding a c_net ref that it expects to release in rds_tcp_kill_sock. Instead, the rds_tcp_kill_sock callback should make sure to tear down state carefully, ensuring that the socket teardown is only done after all data-structures and workqs that depend on it are quiesced. The original motivation for commit 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net") was to resolve a race condition reported by syzkaller where workqs for tx/rx/connect were triggered after the namespace was deleted. Those worker threads should have been cancelled/flushed before socket tear-down and indeed, rds_conn_path_destroy() does try to sequence this by doing /* cancel cp_send_w */ /* cancel cp_recv_w */ /* flush cp_down_w */ /* free data structures */ Here the "flush cp_down_w" will trigger rds_conn_shutdown and thus invoke rds_tcp_conn_path_shutdown() to close the tcp socket, so that we ought to have satisfied the requirement that "socket-close is done after all other dependent state is quiesced". However, rds_conn_shutdown has a bug in that it *always* triggers the reconnect workq (and if connection is successful, we always restart tx/rx workqs so with the right timing, we risk the race conditions reported by syzkaller). Netns deletion is like module teardown- no need to restart a reconnect in this case. We can use the c_destroy_in_prog bit to avoid restarting the reconnect. Fixes: 8edc3affc077 ("rds: tcp: Take explicit refcounts on struct net") Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com> Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/rds/connection.c3
-rw-r--r--net/rds/rds.h6
-rw-r--r--net/rds/tcp.c4
3 files changed, 7 insertions, 6 deletions
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7ee2d5d68b78..9efc82c665b5 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
366 * to the conn hash, so we never trigger a reconnect on this 366 * to the conn hash, so we never trigger a reconnect on this
367 * conn - the reconnect is always triggered by the active peer. */ 367 * conn - the reconnect is always triggered by the active peer. */
368 cancel_delayed_work_sync(&cp->cp_conn_w); 368 cancel_delayed_work_sync(&cp->cp_conn_w);
369 if (conn->c_destroy_in_prog)
370 return;
369 rcu_read_lock(); 371 rcu_read_lock();
370 if (!hlist_unhashed(&conn->c_hash_node)) { 372 if (!hlist_unhashed(&conn->c_hash_node)) {
371 rcu_read_unlock(); 373 rcu_read_unlock();
@@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn)
445 */ 447 */
446 rds_cong_remove_conn(conn); 448 rds_cong_remove_conn(conn);
447 449
448 put_net(conn->c_net);
449 kfree(conn->c_path); 450 kfree(conn->c_path);
450 kmem_cache_free(rds_conn_slab, conn); 451 kmem_cache_free(rds_conn_slab, conn);
451 452
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c349c71babff..d09f6c1facb4 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -150,7 +150,7 @@ struct rds_connection {
150 150
151 /* Protocol version */ 151 /* Protocol version */
152 unsigned int c_version; 152 unsigned int c_version;
153 struct net *c_net; 153 possible_net_t c_net;
154 154
155 struct list_head c_map_item; 155 struct list_head c_map_item;
156 unsigned long c_map_queued; 156 unsigned long c_map_queued;
@@ -165,13 +165,13 @@ struct rds_connection {
165static inline 165static inline
166struct net *rds_conn_net(struct rds_connection *conn) 166struct net *rds_conn_net(struct rds_connection *conn)
167{ 167{
168 return conn->c_net; 168 return read_pnet(&conn->c_net);
169} 169}
170 170
171static inline 171static inline
172void rds_conn_net_set(struct rds_connection *conn, struct net *net) 172void rds_conn_net_set(struct rds_connection *conn, struct net *net)
173{ 173{
174 conn->c_net = get_net(net); 174 write_pnet(&conn->c_net, net);
175} 175}
176 176
177#define RDS_FLAG_CONG_BITMAP 0x01 177#define RDS_FLAG_CONG_BITMAP 0x01
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 222cc530e5b5..f580f72ae69e 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -506,7 +506,7 @@ static void rds_tcp_kill_sock(struct net *net)
506 rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); 506 rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
507 spin_lock_irq(&rds_tcp_conn_lock); 507 spin_lock_irq(&rds_tcp_conn_lock);
508 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { 508 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
509 struct net *c_net = tc->t_cpath->cp_conn->c_net; 509 struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
510 510
511 if (net != c_net || !tc->t_sock) 511 if (net != c_net || !tc->t_sock)
512 continue; 512 continue;
@@ -563,7 +563,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
563 563
564 spin_lock_irq(&rds_tcp_conn_lock); 564 spin_lock_irq(&rds_tcp_conn_lock);
565 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { 565 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
566 struct net *c_net = tc->t_cpath->cp_conn->c_net; 566 struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
567 567
568 if (net != c_net || !tc->t_sock) 568 if (net != c_net || !tc->t_sock)
569 continue; 569 continue;