diff options
author | David S. Miller <davem@davemloft.net> | 2008-06-12 19:31:35 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-06-12 19:34:35 -0400 |
commit | ec0a196626bd12e0ba108d7daa6d95a4fb25c2c5 (patch) | |
tree | 68d9c2923765e12853368e8edb27b241142e0c48 /net/ipv4/tcp_minisocks.c | |
parent | f23d60de719e639690b2dc5c2d0e4243ff614b7a (diff) |
tcp: Revert 'process defer accept as established' changes.
This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and
the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").
This change causes several problems, first reported by Ingo Molnar
as a distcc-over-loopback regression where connections were getting
stuck.
Ilpo Järvinen first spotted the locking problems. The new function
added by this code, tcp_defer_accept_check(), only has the
child socket locked, yet it is modifying state of the parent
listening socket.
Fixing that is non-trivial at best, because we can't simply just grab
the parent listening socket lock at this point, because it would
create an ABBA deadlock. The normal ordering is parent listening
socket --> child socket, but this code path would require the
reverse lock ordering.
Next is a problem noticed by Vitaliy Gusev, he noted:
----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
> goto death;
> }
>
>+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+ tcp_send_active_reset(sk, GFP_ATOMIC);
>+ goto death;
Here socket sk is not attached to listening socket's request queue. tcp_done()
will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should
release this sk) as socket is not DEAD. Therefore socket sk will be lost for
freeing.
----------------------------------------
Finally, Alexey Kuznetsov argues that there might not even be any
real value or advantage to these new semantics even if we fix all
of the bugs:
----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this really
so valuable? My opinion: no, this is nothing but a new loophole
to consume memory without control.
----------------------------------------
So revert this thing for now.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 32 |
1 files changed, 12 insertions, 20 deletions
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 019c8c16e5cc..8245247a6ceb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
571 | does sequence test, SYN is truncated, and thus we consider | 571 | does sequence test, SYN is truncated, and thus we consider |
572 | it a bare ACK. | 572 | it a bare ACK. |
573 | 573 | ||
574 | Both ends (listening sockets) accept the new incoming | 574 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this |
575 | connection and try to talk to each other. 8-) | 575 | bare ACK. Otherwise, we create an established connection. Both |
576 | ends (listening sockets) accept the new incoming connection and try | ||
577 | to talk to each other. 8-) | ||
576 | 578 | ||
577 | Note: This case is both harmless, and rare. Possibility is about the | 579 | Note: This case is both harmless, and rare. Possibility is about the |
578 | same as us discovering intelligent life on another plant tomorrow. | 580 | same as us discovering intelligent life on another plant tomorrow. |
@@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
640 | if (!(flg & TCP_FLAG_ACK)) | 642 | if (!(flg & TCP_FLAG_ACK)) |
641 | return NULL; | 643 | return NULL; |
642 | 644 | ||
645 | /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ | ||
646 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | ||
647 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | ||
648 | inet_rsk(req)->acked = 1; | ||
649 | return NULL; | ||
650 | } | ||
651 | |||
643 | /* OK, ACK is valid, create big socket and | 652 | /* OK, ACK is valid, create big socket and |
644 | * feed this segment to it. It will repeat all | 653 | * feed this segment to it. It will repeat all |
645 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO | 654 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO |
@@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
678 | inet_csk_reqsk_queue_unlink(sk, req, prev); | 687 | inet_csk_reqsk_queue_unlink(sk, req, prev); |
679 | inet_csk_reqsk_queue_removed(sk, req); | 688 | inet_csk_reqsk_queue_removed(sk, req); |
680 | 689 | ||
681 | if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | 690 | inet_csk_reqsk_queue_add(sk, req, child); |
682 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | ||
683 | |||
684 | /* the accept queue handling is done is est recv slow | ||
685 | * path so lets make sure to start there | ||
686 | */ | ||
687 | tcp_sk(child)->pred_flags = 0; | ||
688 | sock_hold(sk); | ||
689 | sock_hold(child); | ||
690 | tcp_sk(child)->defer_tcp_accept.listen_sk = sk; | ||
691 | tcp_sk(child)->defer_tcp_accept.request = req; | ||
692 | |||
693 | inet_csk_reset_keepalive_timer(child, | ||
694 | inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); | ||
695 | } else { | ||
696 | inet_csk_reqsk_queue_add(sk, req, child); | ||
697 | } | ||
698 | |||
699 | return child; | 691 | return child; |
700 | 692 | ||
701 | listen_overflow: | 693 | listen_overflow: |