tcp: Revert 'process defer accept as established' changes.

This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and
the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").

This change causes several problems, first reported by Ingo Molnar
as a distcc-over-loopback regression where connections were getting
stuck.

Ilpo Järvinen first spotted the locking problems.  The new function
added by this code, tcp_defer_accept_check(), only has the
child socket locked, yet it is modifying state of the parent
listening socket.

Fixing that is non-trivial at best, because we can't simply grab
the parent listening socket lock at this point, because it would
create an ABBA deadlock.  The normal ordering is parent listening
socket --> child socket, but this code path would require the
reverse lock ordering.

Next is a problem noticed by Vitaliy Gusev, who noted:

----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
> 		goto death;
> 	}
>
>+	if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+		tcp_send_active_reset(sk, GFP_ATOMIC);
>+		goto death;

Here socket sk is not attached to listening socket's request queue. tcp_done()
will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should
release this sk) as socket is not DEAD. Therefore socket sk will be lost for
freeing.
----------------------------------------

Finally, Alexey Kuznetsov argues that there might not even be any
real value or advantage to these new semantics even if we fix all
of the bugs:

----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this really
so valuable? My opinion: no, this is nothing but a new loophole
to consume memory without control.
----------------------------------------

So revert this thing for now.

Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2008-06-12 19:31:35 -0400
committer: David S. Miller <davem@davemloft.net> 2008-06-12 19:34:35 -0400
commit: ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 (patch)
tree: 68d9c2923765e12853368e8edb27b241142e0c48 /net/ipv4
parent: f23d60de719e639690b2dc5c2d0e4243ff614b7a (diff)
6 files changed, 31 insertions, 88 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 828ea211ff21..045e799d3e1d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
        struct inet_connection_sock *icsk = inet_csk(parent);
        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
        struct listen_sock *lopt = queue->listen_opt;
-        int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+        int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+        int thresh = max_retries;
        unsigned long now = jiffies;
        struct request_sock **reqp, *req;
        int i, budget;
@@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
                }
        }
+        if (queue->rskq_defer_accept)
+                max_retries = queue->rskq_defer_accept;
        budget = 2 * (lopt->nr_table_entries / (timeout / interval));
        i = lopt->clock_hand;
@@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
                reqp=&lopt->syn_table[i];
                while ((req = *reqp) != NULL) {
                        if (time_after_eq(now, req->expires)) {
-                                if (req->retrans < thresh &&
+                                if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) &&
-                                    !req->rsk_ops->rtx_syn_ack(parent, req)) {
+                                    (inet_rsk(req)->acked ||
+                                     !req->rsk_ops->rtx_syn_ack(parent, req))) {
                                        unsigned long timeo;
                                        if (req->retrans++ == 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ab66683b8043..fc54a48fde1e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2112,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                break;
        case TCP_DEFER_ACCEPT:
-                if (val < 0) {
+                icsk->icsk_accept_queue.rskq_defer_accept = 0;
-                        err = -EINVAL;
+                if (val > 0) {
-                } else {
+                        /* Translate value in seconds to number of
-                        if (val > MAX_TCP_ACCEPT_DEFERRED)
+                         * retransmits */
-                                val = MAX_TCP_ACCEPT_DEFERRED;
+                        while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
-                        icsk->icsk_accept_queue.rskq_defer_accept = val;
+                               val > ((TCP_TIMEOUT_INIT / HZ) <<
+                                       icsk->icsk_accept_queue.rskq_defer_accept))
+                                icsk->icsk_accept_queue.rskq_defer_accept++;
+                        icsk->icsk_accept_queue.rskq_defer_accept++;
                }
                break;
@@ -2299,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                        val = (val ? : sysctl_tcp_fin_timeout) / HZ;
                break;
        case TCP_DEFER_ACCEPT:
-                val = icsk->icsk_accept_queue.rskq_defer_accept;
+                val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+                        ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
                break;
        case TCP_WINDOW_CLAMP:
                val = tp->window_clamp;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eba873e9b560..cad73b7dfef0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4541,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
        }
 }
-static int tcp_defer_accept_check(struct sock *sk)
-{
-        struct tcp_sock *tp = tcp_sk(sk);
-        if (tp->defer_tcp_accept.request) {
-                int queued_data =  tp->rcv_nxt - tp->copied_seq;
-                int hasfin =  !skb_queue_empty(&sk->sk_receive_queue) ?
-                        tcp_hdr((struct sk_buff *)
-                                sk->sk_receive_queue.prev)->fin : 0;
-                if (queued_data && hasfin)
-                        queued_data--;
-                if (queued_data &&
-                    tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
-                        if (sock_flag(sk, SOCK_KEEPOPEN)) {
-                                inet_csk_reset_keepalive_timer(sk,
-                                                               keepalive_time_when(tp));
-                        } else {
-                                inet_csk_delete_keepalive_timer(sk);
-                        }
-                        inet_csk_reqsk_queue_add(
-                                tp->defer_tcp_accept.listen_sk,
-                                tp->defer_tcp_accept.request,
-                                sk);
-                        tp->defer_tcp_accept.listen_sk->sk_data_ready(
-                                tp->defer_tcp_accept.listen_sk, 0);
-                        sock_put(tp->defer_tcp_accept.listen_sk);
-                        sock_put(sk);
-                        tp->defer_tcp_accept.listen_sk = NULL;
-                        tp->defer_tcp_accept.request = NULL;
-                } else if (hasfin ||
-                           tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
-                        tcp_reset(sk);
-                        return -1;
-                }
-        }
-        return 0;
-}
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -4944,8 +4901,6 @@ step5:
        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
-        tcp_defer_accept_check(sk);
        return 0;
 csum_error:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f8485c67d1a..97a230026e13 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1918,14 +1918,6 @@ int tcp_v4_destroy_sock(struct sock *sk)
                sk->sk_sndmsg_page = NULL;
        }
-        if (tp->defer_tcp_accept.request) {
-                reqsk_free(tp->defer_tcp_accept.request);
-                sock_put(tp->defer_tcp_accept.listen_sk);
-                sock_put(sk);
-                tp->defer_tcp_accept.listen_sk = NULL;
-                tp->defer_tcp_accept.request = NULL;
-        }
        atomic_dec(&tcp_sockets_allocated);
        return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 019c8c16e5cc..8245247a6ceb 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
           does sequence test, SYN is truncated, and thus we consider
           it a bare ACK.
-           Both ends (listening sockets) accept the new incoming
+           If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
-           connection and try to talk to each other. 8-)
+           bare ACK.  Otherwise, we create an established connection.  Both
+           ends (listening sockets) accept the new incoming connection and try
+           to talk to each other. 8-)
           Note: This case is both harmless, and rare.  Possibility is about the
           same as us discovering intelligent life on another plant tomorrow.
@@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
                if (!(flg & TCP_FLAG_ACK))
                        return NULL;
+                /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
+                if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+                    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+                        inet_rsk(req)->acked = 1;
+                        return NULL;
+                }
                /* OK, ACK is valid, create big socket and
                 * feed this segment to it. It will repeat all
                 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
@@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
                inet_csk_reqsk_queue_unlink(sk, req, prev);
                inet_csk_reqsk_queue_removed(sk, req);
-                if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+                inet_csk_reqsk_queue_add(sk, req, child);
-                    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
-                        /* the accept queue handling is done is est recv slow
-                         * path so lets make sure to start there
-                         */
-                        tcp_sk(child)->pred_flags = 0;
-                        sock_hold(sk);
-                        sock_hold(child);
-                        tcp_sk(child)->defer_tcp_accept.listen_sk = sk;
-                        tcp_sk(child)->defer_tcp_accept.request = req;
-                        inet_csk_reset_keepalive_timer(child,
-                                                       inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ);
-                } else {
-                        inet_csk_reqsk_queue_add(sk, req, child);
-                }
                return child;
        listen_overflow:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4de68cf5f2aa..63ed9d6830e7 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -489,11 +489,6 @@ static void tcp_keepalive_timer (unsigned long data)
                goto death;
        }
-        if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
-                tcp_send_active_reset(sk, GFP_ATOMIC);
-                goto death;
-        }
        if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
                goto out;
author	David S. Miller <davem@davemloft.net>	2008-06-12 19:31:35 -0400
committer	David S. Miller <davem@davemloft.net>	2008-06-12 19:34:35 -0400
commit	ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 (patch)
tree	68d9c2923765e12853368e8edb27b241142e0c48 /net/ipv4
parent	f23d60de719e639690b2dc5c2d0e4243ff614b7a (diff)